Merge remote-tracking branch 'origin/develop' into embeddings

478df149 · fsx950223 · 8941136f · 80e05267 · 478df149 · 478df149
Commit 478df149 authored Jan 18, 2023 by fsx950223
20 changed files
--- a/client_example/15_reduce/CMakeLists.txt
+++ b/client_example/15_reduce/CMakeLists.txt
+add_executable(client_reduce_nhwc_c reduce_nhwc_c.cpp)
+target_link_libraries(client_reduce_nhwc_c PRIVATE composable_kernel::device_operations)
--- a/client_example/15_reduce/reduce_nhwc_c.cpp
+++ b/client_example/15_reduce/reduce_nhwc_c.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <functional>
+#include <numeric>
+#include <iomanip>
+#include <iostream>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp"
+
+// Element types of the example: input, accumulator and output are all float.
+using InDataType  = float;
+using AccDataType = float;
+using OutDataType = float;
+
+// Operations plugged into DeviceReduce by main(): the reduction itself and the
+// element-wise ops applied to the input and to the accumulated value.
+using ReduceAdd   = ck::reduce::Add;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using UnaryDivide = ck::tensor_operation::element_wise::UnaryDivide;
+
+// Input tensor rank and the number of reduced dimensions; the output keeps
+// Rank - NumReduceDim dimensions.
+constexpr int Rank{4};
+constexpr int NumReduceDim{3};
+
+// Boolean template arguments forwarded to DeviceReduce; both are disabled here.
+constexpr bool PropagateNan{false};
+constexpr bool OutputIndex{false};
+
+// Minimal owning wrapper around one hipMalloc() allocation.
+//
+// The constructor requests mem_size bytes and the destructor hands the pointer
+// back to hipFree(). HIP status codes are deliberately discarded (best effort);
+// p_mem_ is set to nullptr before the allocation is attempted.
+//
+// The type is non-copyable: it owns p_mem_, and a copy would make two objects
+// free the same buffer.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    // Allocates mem_size bytes. explicit, so a size never converts silently
+    // into an allocation.
+    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{nullptr}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    // Returns the raw pointer; ownership stays with this object.
+    void* GetDeviceBuffer() const { return p_mem_; }
+
+    void* p_mem_;
+};
+
+// Profiles every DeviceReduce instance returned by the instance factory on one
+// fixed problem, then runs the fastest supported instance once more.
+//
+// The problem is hard-coded: a packed 4-D float tensor of lengths
+// {16, 8, 128, 256} is reduced over dimensions 0, 1 and 2 with ReduceAdd, and
+// UnaryDivide{reduce_length} is supplied as the accumulator element-wise op.
+// The buffers are never initialised or read back here, so only launch and
+// timing are exercised.
+//
+// argc/argv are unused. The process always exits with status 0.
+int main(int argc, char* argv[])
+{
+    std::array<ck::index_t, Rank> in_lengths{16, 8, 128, 256};
+    std::array<ck::index_t, Rank> in_strides{8 * 128 * 256, 128 * 256, 256, 1};
+    std::array<ck::index_t, Rank - NumReduceDim> out_lengths{256};
+    std::array<ck::index_t, Rank - NumReduceDim> out_strides{1};
+    std::array<int, NumReduceDim> reduce_dims{0, 1, 2};
+
+    const ck::index_t num_in_elements =
+        std::accumulate(in_lengths.begin(), in_lengths.end(), 1, std::multiplies<ck::index_t>());
+
+    const ck::index_t num_out_elements =
+        std::accumulate(out_lengths.begin(), out_lengths.end(), 1, std::multiplies<ck::index_t>());
+
+    // Number of input elements folded into each output element.
+    ck::index_t reduce_length = 1;
+
+    for(auto dim : reduce_dims)
+        reduce_length *= in_lengths[dim];
+
+    float alpha{1.0f};
+    float beta{0.0f};
+
+    SimpleDeviceMem in(sizeof(InDataType) * num_in_elements);
+    SimpleDeviceMem out(sizeof(OutDataType) * num_out_elements);
+
+    using DeviceOp     = ck::tensor_operation::device::DeviceReduce<InDataType,
+                                                                AccDataType,
+                                                                OutDataType,
+                                                                Rank,
+                                                                NumReduceDim,
+                                                                ReduceAdd,
+                                                                PassThrough,
+                                                                UnaryDivide,
+                                                                PropagateNan,
+                                                                OutputIndex>;
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    // The profiling pass and the final run need an identical argument object,
+    // so it is built in exactly one place.
+    const auto make_argument = [&](const auto& op_ptr) {
+        return op_ptr->MakeArgumentPointer(in_lengths,
+                                           in_strides,
+                                           out_lengths,
+                                           out_strides,
+                                           reduce_dims,
+                                           alpha,
+                                           beta,
+                                           in.GetDeviceBuffer(),
+                                           nullptr,
+                                           out.GetDeviceBuffer(),
+                                           nullptr,
+                                           PassThrough{},
+                                           UnaryDivide{reduce_length});
+    };
+
+    // Bytes moved per run: the input is read once; the output is written once,
+    // and counted a second time when beta is non-zero. Loop-invariant.
+    const std::size_t num_bytes = num_in_elements * sizeof(InDataType) +
+                                  (beta == 0.0f ? 1 : 2) * num_out_elements * sizeof(OutDataType);
+
+    std::string best_op_name;
+    bool found              = false;
+    std::size_t best_op_id  = 0; // only meaningful once found is true
+    float best_ave_time     = std::numeric_limits<float>::max();
+    float best_gb_per_sec   = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(std::size_t i = 0; i < op_ptrs.size(); ++i)
+    {
+        const auto& op_ptr = op_ptrs[i];
+
+        auto argument_ptr         = make_argument(op_ptr);
+        auto invoker_ptr          = op_ptr->MakeInvokerPointer();
+        const std::string op_name = op_ptr->GetTypeString();
+
+        if(!op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+            continue;
+        }
+
+        const float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+        // bytes / 1e6 / ms == GB/s (ave_time is labelled "ms" in the output below).
+        const float gb_per_sec = num_bytes / 1.E6 / ave_time;
+
+        std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
+                  << op_name << std::endl;
+
+        if(ave_time < best_ave_time)
+        {
+            found           = true;
+            best_op_id      = i;
+            best_op_name    = op_name;
+            best_ave_time   = ave_time;
+            best_gb_per_sec = gb_per_sec;
+        }
+    }
+
+    // Without a supported instance there is no meaningful "best" to report
+    // (best_ave_time would still hold the float-max sentinel).
+    if(!found)
+    {
+        std::cout << "No supported instance found for this problem" << std::endl;
+        return 0;
+    }
+
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+              << best_op_name << std::endl;
+
+    // run the best instance
+    const auto& best_op_ptr = op_ptrs[best_op_id];
+    std::cout << "Run the best instance without timing: " << best_op_ptr->GetTypeString()
+              << std::endl;
+
+    auto best_argument_ptr = make_argument(best_op_ptr);
+    auto best_invoker_ptr  = best_op_ptr->MakeInvokerPointer();
+
+    if(best_op_ptr->IsSupportedArgument(best_argument_ptr.get()))
+    {
+        best_invoker_ptr->Run(best_argument_ptr.get(), StreamConfig{nullptr, false});
+    }
+
+    std::cout << "Done" << std::endl;
+
+    return 0;
+}
--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
@@ -35,3 +35,8 @@ add_example_executable_no_testing(example_gemm_xdl_fp64 gemm_xdl_fp64.cpp)

 add_dependencies(example_gemm_xdl example_gemm_xdl_skip_b_lds_fp16)
 add_dependencies(example_gemm_xdl example_gemm_xdl_fp64)
+
+add_custom_target(example_gemm_wmma)
+add_example_executable(example_gemm_wmma_fp16 gemm_wmma_fp16.cpp)
+add_dependencies(example_gemm_wmma example_gemm_wmma_fp16)
+
--- a/example/01_gemm/gemm_wmma_fp16.cpp
+++ b/example/01_gemm/gemm_wmma_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp"
+
+// Element types of the GEMM operands and result.
+using ADataType = ck::half_t;
+using BDataType = ck::half_t;
+using CDataType = ck::half_t;
+
+// Types used for accumulation and for the C-shuffle stage.
+using AccDataType      = float;
+using CShuffleDataType = float;
+
+// Tensor layouts of A, B and C (Row/Col are presumably provided by common.hpp).
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+
+// Element-wise operations attached to A, B and C; all are PassThrough here.
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+
+// GEMM specialization selected for the device instance below.
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle
+// ######| ALayout| BLayout| CLayout|     AData|     BData|     CData|     AccData|         CShuffle|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per|  K1| MPer| NPer|MRepeat|NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+// ######|        |        |        |      Type|      Type|      Type|        Type|         DataType| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    | WMMA| WMMA|       |       |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|MWmmaPerWave|NWmmaPerWave|        _MBlock_MWaveMPerWmma| ScalarPerVector|
+// ######|        |        |        |          |          |          |            |                 |   Operation|   Operation|   Operation|               |      |      |      |      |    |     |     |       |       | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|        _NBlock_NWaveNPerWmma|  _NWaveNPerWmma|
+// ######|        |        |        |          |          |          |            |                 |            |            |            |               |      |      |      |      |    |     |     |       |       |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,   256,   128,   256,     8,   8,   16,   16,      4,      4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,              S<1, 32, 1,  8>,               8, 1>;
+// clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::
+    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+
+#include "run_gemm_example.inc"
+
+// Entry point: forwards the command line to run_gemm_example() and maps a
+// truthy result to exit status 0, a falsy one to 1.
+int main(int argc, char* argv[])
+{
+    const bool passed = run_gemm_example(argc, argv);
+    return passed ? 0 : 1;
+}
--- a/example/12_reduce/reduce_blockwise_impl.hpp
+++ b/example/12_reduce/reduce_blockwise_impl.hpp
@@ -9,6 +9,7 @@
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp"

 #include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -16,7 +17,6 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/host_common_util.hpp"
-#include "ck/library/utility/host_reduction.hpp"

 #include "reduce_example_common.hpp"

@@ -236,38 +236,57 @@ int reduce_blockwise_impl(bool do_verification,
        reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
            static_cast<int32_t>(reduce_total_length));

+    std::array<index_t, Rank> arrInLengths;
+    std::array<index_t, Rank> arrInStrides;
+    std::array<index_t, NumOutDim> arrOutLengths;
+    std::array<index_t, NumOutDim> arrOutStrides;
+
+    ck::ranges::copy(inLengths, arrInLengths.begin());
+    ck::ranges::copy(inStrides, arrInStrides.begin());
+    ck::ranges::copy(outLengths, arrOutLengths.begin());
+    ck::ranges::copy(outStrides, arrOutStrides.begin());
+
    if(do_verification)
    {
-        ReductionHost<InOutDataType,
+        using ReferenceReduceInstance =
+            ck::tensor_operation::host::ReferenceReduce<InOutDataType,
                                                        AccDataType,
                                                        InOutDataType,
+                                                        Rank,
+                                                        NumReduceDim,
                                                        ReduceOperation,
                                                        InElementwiseOperation,
                                                        AccElementwiseOperation,
-                      Rank,
-                      NumReduceDim,
                                                        PropagateNan,
-                      OutputIndex>
-            hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
+                                                        OutputIndex>;

-        hostReduce.Run(alpha,
-                       in.mData.data(),
+        auto reduce_ref = ReferenceReduceInstance{};
+
+        auto argument_ptr_ref = reduce_ref.MakeArgumentPointer(arrInLengths,
+                                                               arrInStrides,
+                                                               arrOutLengths,
+                                                               arrOutStrides,
+                                                               reduceDims,
+                                                               alpha,
                                                               beta,
+                                                               in.mData.data(),
+                                                               nullptr,
                                                               out_ref.mData.data(),
                                                               out_indices_ref.mData.data(),
                                                               in_elementwise_op,
                                                               acc_elementwise_op);
+
+        if(!reduce_ref.IsSupportedArgument(argument_ptr_ref.get()))
+        {
+            std::cout << "The runtime parameters not supported by the reduce reference, exiting!"
+                      << std::endl;
+            return (false);
        };

-    std::array<index_t, Rank> arrInLengths;
-    std::array<index_t, Rank> arrInStrides;
-    std::array<index_t, NumOutDim> arrOutLengths;
-    std::array<index_t, NumOutDim> arrOutStrides;
+        auto invoker_ptr_ref = reduce_ref.MakeInvokerPointer();

-    ck::ranges::copy(inLengths, arrInLengths.begin());
-    ck::ranges::copy(inStrides, arrInStrides.begin());
-    ck::ranges::copy(outLengths, arrOutLengths.begin());
-    ck::ranges::copy(outStrides, arrOutStrides.begin());
+        invoker_ptr_ref->Run(argument_ptr_ref.get());
+    };

    auto reduce = DeviceReduceInstance{};

@@ -287,8 +306,7 @@ int reduce_blockwise_impl(bool do_verification,

    if(!reduce.IsSupportedArgument(argument_ptr.get()))
    {
-        std::cerr
-            << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!"
+        std::cerr << "The runtime parameters not supported by the DeviceReduce instance, exiting!"
                  << std::endl;

        return (-2);

--- a/example/12_reduce/reduce_blockwise_two_call.cpp
+++ b/example/12_reduce/reduce_blockwise_two_call.cpp
@@ -12,13 +12,13 @@
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp"

 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/host_common_util.hpp"
-#include "ck/library/utility/host_reduction.hpp"

 using namespace ck;
 using namespace ck::tensor_operation::device;
@@ -98,7 +98,7 @@ int main(int argc, char* argv[])

    // used by the host reduction
    const std::array<int, 2> reduceDims = {3, 4};
-    const std::array<int, 3> invariantDims = {0, 1, 2};
+    // const std::array<int, 3> invariantDims = {0, 1, 2};

    const std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};

@@ -191,42 +191,61 @@ int main(int argc, char* argv[])
        reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
            static_cast<int32_t>(reduce_total_length));

+    std::array<index_t, 5> arrInLengths_1;
+    std::array<index_t, 5> arrInStrides_1;
+    std::array<index_t, 4> arrInLengths_2;
+    std::array<index_t, 4> arrInStrides_2;
+    std::array<index_t, 3> arrOutLengths;
+    std::array<index_t, 3> arrOutStrides;
+
+    ck::ranges::copy(inLengths_1, arrInLengths_1.begin());
+    ck::ranges::copy(inStrides_1, arrInStrides_1.begin());
+    ck::ranges::copy(inLengths_2, arrInLengths_2.begin());
+    ck::ranges::copy(inStrides_2, arrInStrides_2.begin());
+    ck::ranges::copy(outLengths, arrOutLengths.begin());
+    ck::ranges::copy(outStrides, arrOutStrides.begin());
+
    if(do_verify)
    {
-        ReductionHost<InOutDataType,
+        using ReferenceReduceInstance =
+            ck::tensor_operation::host::ReferenceReduce<InOutDataType,
                                                        AccDataType,
                                                        InOutDataType,
+                                                        5,
+                                                        2,
                                                        ReduceOperation,
                                                        InElementwiseOperation,
                                                        AccElementwiseOperation,
-                      5, // Rank
-                      2, // NumReduceDim
                                                        PropagateNan,
-                      OutputIndex>
-            hostReduce(in_1.mDesc, out_ref.mDesc, invariantDims, reduceDims);
+                                                        OutputIndex>;

-        hostReduce.Run(alpha,
-                       in_1.mData.data(),
+        auto reduce_ref = ReferenceReduceInstance{};
+
+        auto argument_ptr_ref = reduce_ref.MakeArgumentPointer(arrInLengths_1,
+                                                               arrInStrides_1,
+                                                               arrOutLengths,
+                                                               arrOutStrides,
+                                                               reduceDims,
+                                                               alpha,
                                                               beta,
+                                                               in_1.mData.data(),
+                                                               nullptr,
                                                               out_ref.mData.data(),
                                                               nullptr,
                                                               in_elementwise_op,
                                                               acc_elementwise_op);
+
+        if(!reduce_ref.IsSupportedArgument(argument_ptr_ref.get()))
+        {
+            std::cout << "The runtime parameters not supported by the reduce reference, exiting!"
+                      << std::endl;
+            return (false);
        };

-    std::array<index_t, 5> arrInLengths_1;
-    std::array<index_t, 5> arrInStrides_1;
-    std::array<index_t, 4> arrInLengths_2;
-    std::array<index_t, 4> arrInStrides_2;
-    std::array<index_t, 3> arrOutLengths;
-    std::array<index_t, 3> arrOutStrides;
+        auto invoker_ptr_ref = reduce_ref.MakeInvokerPointer();

-    ck::ranges::copy(inLengths_1, arrInLengths_1.begin());
-    ck::ranges::copy(inStrides_1, arrInStrides_1.begin());
-    ck::ranges::copy(inLengths_2, arrInLengths_2.begin());
-    ck::ranges::copy(inStrides_2, arrInStrides_2.begin());
-    ck::ranges::copy(outLengths, arrOutLengths.begin());
-    ck::ranges::copy(outStrides, arrOutStrides.begin());
+        invoker_ptr_ref->Run(argument_ptr_ref.get());
+    };

    auto reduce_1 = DeviceReduceInstance_1{};

@@ -246,8 +265,7 @@ int main(int argc, char* argv[])

    if(!reduce_1.IsSupportedArgument(argument_ptr_1.get()))
    {
-        std::cout
-            << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!"
+        std::cout << "The runtime parameters seems supported by the DeviceReduce instance, exiting!"
                  << std::endl;
    };


--- a/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp
+++ b/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp
@@ -9,6 +9,7 @@
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp"

 #include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -16,7 +17,6 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/host_common_util.hpp"
-#include "ck/library/utility/host_reduction.hpp"

 #include "reduce_example_common.hpp"

@@ -149,38 +149,57 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
        reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
            static_cast<int32_t>(reduce_total_length));

+    std::array<index_t, Rank> arrInLengths;
+    std::array<index_t, Rank> arrInStrides;
+    std::array<index_t, NumOutDim> arrOutLengths;
+    std::array<index_t, NumOutDim> arrOutStrides;
+
+    ck::ranges::copy(inLengths, arrInLengths.begin());
+    ck::ranges::copy(inStrides, arrInStrides.begin());
+    ck::ranges::copy(outLengths, arrOutLengths.begin());
+    ck::ranges::copy(outStrides, arrOutStrides.begin());
+
    if(do_verification)
    {
-        ReductionHost<InOutDataType,
+        using ReferenceReduceInstance =
+            ck::tensor_operation::host::ReferenceReduce<InOutDataType,
                                                        AccDataType,
                                                        InOutDataType,
+                                                        Rank,
+                                                        NumReduceDim,
                                                        ReduceOperation,
                                                        InElementwiseOperation,
                                                        AccElementwiseOperation,
-                      Rank,
-                      NumReduceDim,
                                                        PropagateNan,
-                      false>
-            hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
+                                                        false>;

-        hostReduce.Run(alpha,
-                       in.mData.data(),
+        auto reduce_ref = ReferenceReduceInstance{};
+
+        auto argument_ptr_ref = reduce_ref.MakeArgumentPointer(arrInLengths,
+                                                               arrInStrides,
+                                                               arrOutLengths,
+                                                               arrOutStrides,
+                                                               reduceDims,
+                                                               alpha,
                                                               beta,
+                                                               in.mData.data(),
+                                                               nullptr,
                                                               out_ref.mData.data(),
                                                               nullptr,
                                                               in_elementwise_op,
                                                               acc_elementwise_op);
+
+        if(!reduce_ref.IsSupportedArgument(argument_ptr_ref.get()))
+        {
+            std::cout << "The runtime parameters not supported by the reduce reference, exiting!"
+                      << std::endl;
+            return (false);
        };

-    std::array<index_t, Rank> arrInLengths;
-    std::array<index_t, Rank> arrInStrides;
-    std::array<index_t, NumOutDim> arrOutLengths;
-    std::array<index_t, NumOutDim> arrOutStrides;
+        auto invoker_ptr_ref = reduce_ref.MakeInvokerPointer();

-    ck::ranges::copy(inLengths, arrInLengths.begin());
-    ck::ranges::copy(inStrides, arrInStrides.begin());
-    ck::ranges::copy(outLengths, arrOutLengths.begin());
-    ck::ranges::copy(outStrides, arrOutStrides.begin());
+        invoker_ptr_ref->Run(argument_ptr_ref.get());
+    };

    auto reduce = DeviceReduceInstance{};

@@ -200,8 +219,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,

    if(!reduce.IsSupportedArgument(argument_ptr.get()))
    {
-        std::cerr
-            << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!"
+        std::cerr << "The runtime parameters not supported by the DeviceReduce instance, exiting!"
                  << std::endl;

        return (-2);

--- a/example/21_gemm_layernorm/CMakeLists.txt
+++ b/example/21_gemm_layernorm/CMakeLists.txt
-add_example_executable(example_gemm_bias_relu_add_layernorm_xdl_fp16 gemm_bias_relu_add_layernorm_xdl_fp16.cpp)
-add_example_executable(example_gemm_layernorm_xdl_fp16 gemm_layernorm_xdl_fp16.cpp)
-add_example_executable(example_gemm_xdl_layernorm_single_kernel_fp16 gemm_xdl_layernorm_single_kernel_fp16.cpp)
+add_example_executable(example_gemm_bias_relu_add_layernorm_xdl_welford_fp16 gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp)
+add_example_executable(example_gemm_bias_relu_add_layernorm_xdl_naive_fp16 gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp)
+add_example_executable(example_gemm_layernorm_xdl_naive_fp16 gemm_layernorm_xdl_naive_fp16.cpp)
+add_example_executable(example_gemm_xdl_layernorm_naive_single_kernel_fp16 gemm_xdl_layernorm_naive_single_kernel_fp16.cpp)
--- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp
--- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
+#include "ck/library/utility/check_err.hpp"
+
+// Shorthand for a compile-time integer sequence; used below to spell the
+// tuning parameters of DeviceOpInstance compactly (e.g. S<4, 64, 1>).
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+// Scalar type shorthands.
+using F16 = ck::half_t;
+using F32 = float;
+
+// GEMM memory-layout tag types.
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+// Element-wise functors applied to the operands / results.
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AddReluAdd  = ck::tensor_operation::element_wise::AddReluAdd;
+
+// DataType
+// A/B are the GEMM inputs, D0/D1 the extra operands fed to CDEElementOp
+// (D0 is the 1-D bias and D1 the M x N tensor in main() below), H the final
+// output. Accumulation and the C-shuffle stage use F32.
+using ADataType        = F16;
+using BDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using D0DataType       = F16;
+using D1DataType       = F16;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
+using EMeanVarDataType = F16;
+using GammaDataType    = F16;
+using BetaDataType     = F16;
+using HDataType        = F16;
+
+// Layout
+// A is row-major and B column-major; the D tensors and the output H are row-major.
+using ALayout  = Row;
+using BLayout  = Col;
+using D0Layout = Row;
+using D1Layout = Row;
+using DsLayout = ck::Tuple<D0Layout, D1Layout>;
+using HLayout  = Row;
+
+// Element-wise operations plugged into the device op and the host reference.
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = AddReluAdd;
+using HElementOp   = PassThrough;
+
+// NOTE: despite the name, this selects the MNKPadding specialization, not Default.
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+// Concrete instantiation of DeviceGemmMultipleDLayernorm_Xdl_CShuffle used by main().
+// The header rows below label each template argument column of the instantiation.
+// clang-format off
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Xdl_CShuffle
+//######| ALayout| BLayout| DsLayout| HLayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EMeanVarData|     GammaData|     BetaData|     HData|           A|           B|          CDE|            H|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|           PostShuffle|     PostShuffle|            Layernorm|       Layernorm|
+//######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|             Type|          Type|         Type|      Type| Elementwise| Elementwise|  Elementwise|  Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|  ThreadClusterLengths| ScalarPerVector| ThreadClusterLengths| ThreadSliceSize|
+//######|        |        |         |        |          |          |            |                 |           |                 |              |             |          |   Operation|   Operation|    Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                  _M_N|            _M_N|                 _M_N|              _M|
+//######|        |        |         |        |          |          |            |                 |           |                 |              |             |          |            |            |             |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                      |                |                     |                |
+        < ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EMeanVarDataType, GammaDataType, BetaDataType, HDataType,  AElementOp,  BElementOp, CDEElementOp,   HElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,              S<32, 8>,               8,             S<8, 32>,               8>;
+// clang-format on
+
+// Builds the descriptor of a 1-D host tensor holding `len` elements that are
+// `stride` elements apart.
+auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
+    using Dims = std::vector<std::size_t>;
+
+    return HostTensorDescriptor(Dims{len}, Dims{stride});
+};
+
+// Builds the descriptor of a `row` x `col` host matrix. The type of `layout`
+// (a tag object, its value is ignored) picks the strides: RowMajor gives
+// {stride, 1}, any other tag gives {1, stride}.
+auto f_host_tensor_descriptor2d =
+    [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+        using Dims = std::vector<std::size_t>;
+
+        constexpr bool row_major =
+            std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value;
+
+        return HostTensorDescriptor(Dims{row, col},
+                                    row_major ? Dims{stride, 1} : Dims{1, stride});
+    };
+
+// Host-side reference for the fused device op, computed in three steps:
+//   1. c = ReferenceGemm(a, b), kept in AccDataType;
+//   2. e(m, n) = cde_element_op(c(m, n), bias(n), d1(m, n)), stored as EMeanVarDataType;
+//   3. h = ReferenceLayernorm(e, gamma, beta, epsilon).
+// The result is written to `h_m_n`; M and N are the logical matrix extents.
+void host_gemm_layernorm(Tensor<HDataType>& h_m_n,
+                         const Tensor<ADataType>& a_m_k,
+                         const Tensor<BDataType>& b_k_n,
+                         const Tensor<D0DataType>& bias_n,
+                         const Tensor<D1DataType>& d1_m_n,
+                         const Tensor<GammaDataType>& gamma_n,
+                         const Tensor<BetaDataType>& beta_n,
+                         AElementOp a_element_op,
+                         BElementOp b_element_op,
+                         CDEElementOp cde_element_op,
+                         int M,
+                         int N,
+                         AccDataType epsilon = 1e-5)
+{
+    using GemmRef = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                              BDataType,
+                                                              AccDataType,
+                                                              AccDataType,
+                                                              AElementOp,
+                                                              BElementOp,
+                                                              PassThrough>;
+
+    using LayernormRef = ck::tensor_operation::host::ReferenceLayernorm<EMeanVarDataType,
+                                                                        GammaDataType,
+                                                                        BetaDataType,
+                                                                        HDataType,
+                                                                        AccDataType,
+                                                                        HElementOp,
+                                                                        2,
+                                                                        1>;
+
+    // Intermediate buffers: layernorm input and raw GEMM accumulator output.
+    Tensor<EMeanVarDataType> fused_m_n(HostTensorDescriptor{M, N});
+    Tensor<AccDataType> acc_m_n(HostTensorDescriptor{M, N});
+
+    // Step 1: plain GEMM without an output element-wise op.
+    auto gemm_op   = GemmRef{};
+    auto gemm_args = gemm_op.MakeArgument(
+        a_m_k, b_k_n, acc_m_n, a_element_op, b_element_op, PassThrough{});
+    auto gemm_invoker = gemm_op.MakeInvoker();
+
+    gemm_invoker.Run(gemm_args);
+
+    // Step 2: apply the CDE element-wise op column by column (bias is per column).
+    for(int col = 0; col < N; ++col)
+    {
+        AccDataType bias_val = static_cast<AccDataType>(bias_n(col));
+
+        for(int row = 0; row < M; ++row)
+        {
+            AccDataType fused    = static_cast<AccDataType>(fused_m_n(row, col));
+            AccDataType residual = static_cast<AccDataType>(d1_m_n(row, col));
+
+            cde_element_op(fused, acc_m_n(row, col), bias_val, residual);
+
+            fused_m_n(row, col) = static_cast<EMeanVarDataType>(fused);
+        }
+    }
+
+    // Step 3: layernorm over the fused tensor (lengths {M, N}, reduce dims {1}).
+    auto layernorm_op      = LayernormRef{};
+    auto layernorm_invoker = layernorm_op.MakeInvoker();
+    auto layernorm_args    = layernorm_op.MakeArgument(
+        fused_m_n, gamma_n, beta_n, h_m_n, HElementOp{}, {M, N}, {1}, epsilon);
+
+    layernorm_invoker.Run(layernorm_args);
+}
+
+// Example driver: runs the fused GEMM + element-wise + layernorm device op on a
+// fixed 1024 x 1024 x 1024 problem and compares the result with the host
+// reference. Returns 0 when verification passes and 1 otherwise.
+int main()
+{
+    // Verification is always enabled in this example.
+    bool do_verification = true;
+
+    // GEMM shape
+    ck::index_t M = 1024;
+    ck::index_t N = 1024;
+    ck::index_t K = 1024;
+
+    // A is row-major (M x K) and B is column-major (K x N), so both use K as
+    // their leading stride.
+    ck::index_t StrideA  = K;
+    ck::index_t StrideB  = K;
+    // D0 is the 1-D bias of length N; presumably a zero stride tells the device
+    // op to broadcast it along M -- TODO confirm against the device op.
+    ck::index_t StrideD0 = 0;
+    ck::index_t StrideD1 = N;
+    ck::index_t StrideH  = N;
+
+    float epsilon = 1e-5;
+
+    // Host tensors for every operand and for the device result (h_m_n).
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
+    Tensor<D0DataType> d0_n(f_host_tensor_descriptor1d(N, 1));
+    Tensor<D1DataType> d1_m_n(f_host_tensor_descriptor2d(M, N, StrideD1, D1Layout{}));
+    Tensor<GammaDataType> gamma_n(f_host_tensor_descriptor1d(N, 1));
+    Tensor<BetaDataType> beta_n(f_host_tensor_descriptor1d(N, 1));
+    Tensor<HDataType> h_m_n(f_host_tensor_descriptor2d(M, N, StrideH, HLayout{}));
+
+    // Fill the inputs; GeneratorTensor_3{-1, 1} presumably draws values in
+    // [-1, 1] -- TODO confirm against host_tensor_generator.hpp.
+    a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-1, 1});
+    b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-1, 1});
+    d0_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{-1, 1});
+    d1_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{-1, 1});
+    gamma_n.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{-1, 1});
+    beta_n.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{-1, 1});
+
+    // Device buffers sized (in bytes) from each tensor's element space.
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_n.mDesc.GetElementSpaceSize());
+    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem gamma_device_buf(sizeof(GammaDataType) * gamma_n.mDesc.GetElementSpaceSize());
+    DeviceMem beta_device_buf(sizeof(BetaDataType) * beta_n.mDesc.GetElementSpaceSize());
+    DeviceMem h_device_buf(sizeof(HDataType) * h_m_n.mDesc.GetElementSpaceSize());
+
+    // Upload all inputs; the output buffer h is written only by the kernel.
+    a_device_buf.ToDevice(a_m_k.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+    d0_device_buf.ToDevice(d0_n.mData.data());
+    d1_device_buf.ToDevice(d1_m_n.mData.data());
+    gamma_device_buf.ToDevice(gamma_n.mData.data());
+    beta_device_buf.ToDevice(beta_n.mData.data());
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+    auto h_element_op   = HElementOp{};
+
+    // Build the device op, its invoker and the argument describing this problem.
+    auto device_op = DeviceOpInstance{};
+    auto invoker   = device_op.MakeInvoker();
+    auto argument =
+        device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                               b_device_buf.GetDeviceBuffer(),
+                               {d0_device_buf.GetDeviceBuffer(), d1_device_buf.GetDeviceBuffer()},
+                               gamma_device_buf.GetDeviceBuffer(),
+                               beta_device_buf.GetDeviceBuffer(),
+                               h_device_buf.GetDeviceBuffer(),
+                               M,
+                               N,
+                               K,
+                               StrideA,
+                               StrideB,
+                               {StrideD0, StrideD1},
+                               StrideH,
+                               epsilon,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op,
+                               h_element_op);
+
+    // Bail out (by exception) if this instance cannot handle the problem.
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error("wrong! this device_op instance does not support this problem");
+    }
+
+    // The op needs scratch memory: query its size, allocate it, and attach it to
+    // the argument before launching.
+    size_t workspace_sz = device_op.GetWorkSpaceSize(&argument);
+    DeviceMem workspace_dev(workspace_sz);
+    device_op.SetWorkSpacePointer(&argument, workspace_dev.GetDeviceBuffer());
+
+    // StreamConfig{nullptr, false}: presumably default stream with kernel timing
+    // disabled -- TODO confirm against StreamConfig.
+    invoker.Run(argument, StreamConfig{nullptr, false});
+
+    bool pass = true;
+
+    if(do_verification)
+    {
+        // Recompute the result on the host and compare with the device output.
+        Tensor<HDataType> h_m_n_host(HostTensorDescriptor{M, N});
+        host_gemm_layernorm(h_m_n_host,
+                            a_m_k,
+                            b_k_n,
+                            d0_n,
+                            d1_m_n,
+                            gamma_n,
+                            beta_n,
+                            a_element_op,
+                            b_element_op,
+                            cde_element_op,
+                            M,
+                            N,
+                            epsilon);
+
+        h_device_buf.FromDevice(h_m_n.mData.data());
+        // The two 1e-2 values are presumably the relative and absolute
+        // tolerances of check_err -- TODO confirm the parameter order.
+        pass &=
+            ck::utils::check_err(h_m_n, h_m_n_host, "Error: Incorrect results h_m_n", 1e-2, 1e-2);
+    }
+
+    return pass ? 0 : 1;
+}
--- a/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp
--- a/example/21_gemm_layernorm/gemm_xdl_layernorm_single_kernel_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_xdl_layernorm_single_kernel_fp16.cpp
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -170,6 +170,9 @@
 #define CK_WORKAROUND_SWDEV_XXXXXX_BF16_ATTEN_FWD_GFX908_ISSUE 0
 #endif // __gfx908__

+// flag to enable (1) or disable (0) the debugging output in some kernels
+#define DEBUG_LOG 0
+
 namespace ck {

 enum struct InMemoryDataOperationEnum

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/warp/wmma_gemm.hpp"
+#include "ck/tensor_description/tensor_adaptor.hpp"
+
+#define CK_MNK_LOOP
+
+namespace ck {
+
+template <index_t BlockSize,
+          typename FloatA,
+          typename FloatB,
+          typename FloatAcc,
+          typename AK0MK1BlockDesc,
+          typename BK0NK1BlockDesc,
+          index_t MPerWMMA,
+          index_t NPerWMMA,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack>
+/* Block-level GEMM that drives WmmaGemm over the A/B tiles of one thread block.
+ *
+ * A: K0PerBlock x MPerBlock x K1
+ * B: K0PerBlock x NPerBlock x K1
+ * C: MRepeat x MWave x MSubGroup x NRepeat x NWave x NThreadPerSubGroup x MAccVgprs
+ * KPACK == WMMA_K = 16
+ *
+ * Both block descriptors must be known at compile time, and BlockSize must equal
+ * MWaves * NWaves * WaveSize (enforced by static_asserts in the constructor).
+ */
+struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle
+{
+    static constexpr auto I0    = Number<0>{};
+    static constexpr auto I1    = Number<1>{};
+    static constexpr auto I2    = Number<2>{};
+    static constexpr auto I3    = Number<3>{};
+    static constexpr auto I4    = Number<4>{};
+    // K extent consumed per WMMA step in Run().
+    static constexpr auto WmmaK = Number<16>{};
+
+    using ThisThreadBlock = ThisThreadBlock<BlockSize>;
+
+    // Hardcode of WaveSize, since current HIP Runtime(5.4.0-10984) could not return correct one.
+    static constexpr index_t WaveSize = 32;
+
+    // Block tile extents, read from the [K0, M|N, K1] block descriptors.
+    // NOTE: KPerBlock is derived from the B descriptor only.
+    static constexpr index_t MPerBlock = AK0MK1BlockDesc{}.GetLength(I1);
+    static constexpr index_t NPerBlock = BK0NK1BlockDesc{}.GetLength(I1);
+    static constexpr index_t KPerBlock =
+        BK0NK1BlockDesc{}.GetLength(I0) * BK0NK1BlockDesc{}.GetLength(I2);
+
+    static constexpr index_t A_K0 = AK0MK1BlockDesc{}.GetLength(I0);
+    static constexpr index_t B_K0 = BK0NK1BlockDesc{}.GetLength(I0);
+    static constexpr index_t A_K1 = AK0MK1BlockDesc{}.GetLength(I2);
+    static constexpr index_t B_K1 = BK0NK1BlockDesc{}.GetLength(I2);
+
+    static constexpr auto wmma_gemm =
+        WmmaGemm<FloatA, FloatB, FloatAcc, MPerWMMA, NPerWMMA, KPack>{};
+
+    // Number of waves tiling the block along M and N.
+    static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWMMA);
+    static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWMMA);
+
+    // Per-thread accumulator storage: one vector of GetRegSizePerWmma() elements
+    // for each (MRepeat, NRepeat) tile.
+    StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
+                              FloatAcc,
+                              MRepeat * NRepeat,
+                              wmma_gemm.GetRegSizePerWmma(),
+                              true>
+        c_thread_buf_;
+
+    // Returns a mutable reference to this thread's accumulator buffer.
+    __host__ __device__ constexpr auto& GetCThreadBuffer() { return c_thread_buf_; }
+
+    // Splits the flat thread id into (m-wave id, n-wave id, lane id) using a
+    // merge transform over (MWaves, NWaves, WaveSize).
+    __device__ static auto GetWaveIdx()
+    {
+        const index_t thread_id = ThisThreadBlock::GetThreadId();
+
+        constexpr auto threadid_to_wave_idx_adaptor = make_single_stage_tensor_adaptor(
+            make_tuple(make_merge_transform(make_tuple(MWaves, NWaves, WaveSize))),
+            make_tuple(Sequence<0, 1, 2>{}),
+            make_tuple(Sequence<0>{}));
+
+        return threadid_to_wave_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id));
+    }
+
+    // Origin of this thread's reads in the 5-D A block descriptor.
+    __device__ static auto CalculateAThreadOriginDataIndex()
+    {
+        const auto wave_idx = GetWaveIdx();
+
+        const auto waveId_m = wave_idx[I0];
+
+        const auto WMMA_a_idx = wmma_gemm.CalculateAThreadOriginDataIndex();
+        //  |KRepeat   |MRepeat|MWave      |MLane       |KPack
+        return make_tuple(0, 0, waveId_m, WMMA_a_idx, 0);
+    }
+
+    // Origin of this thread's reads in the 5-D B block descriptor.
+    __device__ static auto CalculateBThreadOriginDataIndex()
+    {
+        const auto wave_idx = GetWaveIdx();
+
+        const auto waveId_n = wave_idx[I1];
+
+        const auto WMMA_b_idx = wmma_gemm.CalculateBThreadOriginDataIndex();
+        //  |KRepeat   |NRepeat|Nwave      |NLane       |KPack
+        return make_tuple(0, 0, waveId_n, WMMA_b_idx, 0);
+    }
+
+    // Returns the (m, n) coordinate inside the block tile where this thread's
+    // part of the (m0, n0) repeat tile begins.
+    template <index_t m0, index_t n0>
+    __device__ static auto CalculateCThreadOriginDataIndex(Number<m0>, Number<n0>)
+    {
+        const auto wave_idx = GetWaveIdx();
+
+        const auto waveId_m = wave_idx[I0];
+        const auto waveId_n = wave_idx[I1];
+
+        const auto blk_idx = wmma_gemm.GetBeginOfThreadBlk();
+
+        // Fold (repeat, wave, offset-within-WMMA) back into a flat m / n index.
+        constexpr auto mrepeat_mwave_mperWMMA_to_m_adaptor = make_single_stage_tensor_adaptor(
+            make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, MPerWMMA))),
+            make_tuple(Sequence<0>{}),
+            make_tuple(Sequence<0, 1, 2>{}));
+
+        constexpr auto nrepeat_nwave_nperWMMA_to_n_adaptor = make_single_stage_tensor_adaptor(
+            make_tuple(make_unmerge_transform(make_tuple(NRepeat, NWaves, NPerWMMA))),
+            make_tuple(Sequence<0>{}),
+            make_tuple(Sequence<0, 1, 2>{}));
+
+        const index_t c_thread_m = mrepeat_mwave_mperWMMA_to_m_adaptor.CalculateBottomIndex(
+            make_tuple(m0, waveId_m, blk_idx[I0]))[I0];
+        const index_t c_thread_n = nrepeat_nwave_nperWMMA_to_n_adaptor.CalculateBottomIndex(
+            make_tuple(n0, waveId_n, blk_idx[I1]))[I0];
+
+        return make_tuple(c_thread_m, c_thread_n);
+    }
+
+    // The constructor only validates the compile-time configuration.
+    __host__ __device__ BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle()
+    {
+        static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() &&
+                          BK0NK1BlockDesc::IsKnownAtCompileTime(),
+                      "wrong! Desc should be known at compile-time");
+
+        static_assert(ThisThreadBlock::GetNumOfThread() == MWaves * NWaves * WaveSize,
+                      "ThisThreadBlock::GetNumOfThread() != MWaves * NWaves * WaveSize\n");
+
+        static_assert(MPerBlock % (MPerWMMA * MRepeat) == 0 &&
+                          NPerBlock % (NPerWMMA * NRepeat) == 0,
+                      "wrong!");
+    }
+
+    // Thread level, register descriptor. Vector-write
+    __host__ __device__ static constexpr auto
+    GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs()
+    {
+        constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens =
+            wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths();
+
+        constexpr auto MSubGroup          = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0];
+        constexpr auto NThreadPerSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1];
+        constexpr auto MAccVgprs          = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2];
+
+        // The wave dimensions are 1 here: this is the per-thread view.
+        return make_naive_tensor_descriptor_packed(
+            //        |MRepeat           |MWave |MSubGroup |NRepeat           |NWave
+            //        |NThreadPerSubGroup |MAccVgprs
+            make_tuple(Number<MRepeat>{},
+                       I1,
+                       MSubGroup,
+                       Number<NRepeat>{},
+                       I1,
+                       NThreadPerSubGroup,
+                       MAccVgprs));
+    }
+
+    // Provide dimension size
+    __host__ __device__ static constexpr auto
+    GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs()
+    {
+        constexpr auto c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma =
+            make_naive_tensor_descriptor_packed(make_tuple(Number<MRepeat>{},
+                                                           Number<MWaves>{},
+                                                           Number<MPerWMMA>{},
+                                                           Number<NRepeat>{},
+                                                           Number<NWaves>{},
+                                                           Number<NPerWMMA>{}));
+
+        return wmma_gemm
+            .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(
+                c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma);
+    }
+
+    // Views the [K0, M, K1] A block descriptor as [K0, MRepeat, MWaves, MPerWMMA, K1].
+    __host__ __device__ static constexpr auto MakeABlockDescriptor_K0_M0_M1_M2_K1()
+    {
+        return transform_tensor_descriptor(
+            AK0MK1BlockDesc{},
+            make_tuple(make_pass_through_transform(Number<A_K0>{}),
+                       make_unmerge_transform(
+                           make_tuple(Number<MRepeat>{}, Number<MWaves>{}, Number<MPerWMMA>{})),
+                       make_pass_through_transform(Number<A_K1>{})),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{}));
+    }
+
+    // Views the [K0, N, K1] B block descriptor as [K0, NRepeat, NWaves, NPerWMMA, K1].
+    __host__ __device__ static constexpr auto MakeBBlockDescriptor_K0_N0_N1_N2_K1()
+    {
+        return transform_tensor_descriptor(
+            BK0NK1BlockDesc{},
+            make_tuple(make_pass_through_transform(Number<B_K0>{}),
+                       make_unmerge_transform(
+                           make_tuple(Number<NRepeat>{}, Number<NWaves>{}, Number<NPerWMMA>{})),
+                       make_pass_through_transform(Number<B_K1>{})),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{}));
+    }
+
+    // M0_M1_M2 = MRepeat_MWave_MPerWmma, N0_N1_N2 = NRepeat_NWave_NPerWmma
+    static constexpr auto a_block_desc_k0_m0_m1_m2_k1 = MakeABlockDescriptor_K0_M0_M1_M2_K1();
+    static constexpr auto b_block_desc_k0_n0_n1_n2_k1 = MakeBBlockDescriptor_K0_N0_N1_N2_K1();
+
+    // Runs the block GEMM for one K tile.
+    //
+    // For every WmmaK-wide step along K and every (m0, n0) repeat tile, the
+    // thread copies its A and B slices from the block buffers into thread-local
+    // buffers, packs WmmaK elements of each into a vector, and hands both
+    // vectors plus the matching C sub-vector of `c_thread_buf` to wmma_gemm.Run.
+    //
+    // a_block_buf / b_block_buf: block-level A and B buffers (read only).
+    // c_thread_buf: per-thread C buffer that wmma_gemm.Run operates on.
+    template <typename ABlockBuffer, typename BBlockBuffer, typename CThreadBuffer>
+    __device__ void Run(const ABlockBuffer& a_block_buf,
+                        const BBlockBuffer& b_block_buf,
+                        CThreadBuffer& c_thread_buf) const
+    {
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatA>(
+            a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatB>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ...
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                // read A
+                a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1,
+                                   make_tuple(Number<k * WmmaK / A_K1>{}, m0, I0, I0, I0),
+                                   a_block_buf,
+                                   a_thread_desc_,
+                                   make_tuple(I0, m0, I0, I0, I0),
+                                   a_thread_buf);
+
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    // read B
+                    b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1,
+                                       make_tuple(Number<k * WmmaK / B_K1>{}, n0, I0, I0, I0),
+                                       b_block_buf,
+                                       b_thread_desc_,
+                                       make_tuple(I0, n0, I0, I0, I0),
+                                       b_thread_buf);
+                    vector_type<FloatA, WmmaK> a_thread_vec;
+                    vector_type<FloatB, WmmaK> b_thread_vec;
+
+                    // Gather the WmmaK elements of this step into flat vectors;
+                    // element i lives at (i / K1, repeat, 0, 0, i % K1).
+                    static_for<0, WmmaK, 1>{}([&](auto i) {
+                        a_thread_vec.template AsType<FloatA>()(i) =
+                            a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                make_tuple(i / A_K1, m0, 0, 0, i % A_K1))>{}];
+                        b_thread_vec.template AsType<FloatB>()(i) =
+                            b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                make_tuple(i / B_K1, n0, 0, 0, i % B_K1))>{}];
+                    });
+
+                    using wmma_input_type_a = typename vector_type<FloatA, WmmaK>::type;
+                    using wmma_input_type_b = typename vector_type<FloatB, WmmaK>::type;
+
+                    constexpr index_t c_offset =
+                        c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                    // No `template` keyword here: Run's template arguments are
+                    // deduced, and `template` without a following argument list
+                    // is ill-formed (rejected by recent Clang).
+                    wmma_gemm.Run(
+                        a_thread_vec.template AsType<wmma_input_type_a>()(Number<0>{}),
+                        b_thread_vec.template AsType<wmma_input_type_b>()(Number<0>{}),
+                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                });
+            });
+        });
+    }
+
+    protected:
+    // A[K0, M0, M1, M2, K1]
+    static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<WmmaK / A_K1>{}, Number<MRepeat>{}, I1, I1, Number<A_K1>{}));
+
+    // B[K0, N0, N1, N2, K1]
+    static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<WmmaK / B_K1>{}, Number<NRepeat>{}, I1, I1, Number<B_K1>{}));
+
+    // C[M, N, NumRegWMMA]
+    static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, wmma_gemm.GetRegSizePerWmma()));
+
+    // Block -> thread copy of one [WmmaK / K1, 1, 1, 1, K1] slice of A,
+    // vectorized along the last (K1) dimension.
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatA,
+                                                         FloatA,
+                                                         decltype(a_block_desc_k0_m0_m1_m2_k1),
+                                                         decltype(a_thread_desc_),
+                                                         Sequence<WmmaK / A_K1, 1, 1, 1, A_K1>,
+                                                         Sequence<0, 1, 2, 3, 4>,
+                                                         4,
+                                                         A_K1,
+                                                         A_K1>;
+
+    // Same as AThreadCopy, for B.
+    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatB,
+                                                         FloatB,
+                                                         decltype(b_block_desc_k0_n0_n1_n2_k1),
+                                                         decltype(b_thread_desc_),
+                                                         Sequence<WmmaK / B_K1, 1, 1, 1, B_K1>,
+                                                         Sequence<0, 1, 2, 3, 4>,
+                                                         4,
+                                                         B_K1,
+                                                         B_K1>;
+
+    // Copy helpers, anchored at this thread's origin inside the block tiles.
+    AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex()};
+    BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()};
+};
+
+// Block-wise WMMA GEMM pipe with a "FIFO" issue order (designed for inline asm).
+//
+// Every thread keeps exactly one WmmaK-deep K-slice of A (MRepeat rows) and of B (NRepeat
+// columns) in VGPR buffers. As soon as the last WMMA that needs a row/column of the current
+// slice has been issued, that row/column is overwritten with its data for the next slice, so
+// the reads for slice k+1 are interleaved with the math of slice k (see Run()).
+//
+// Compile-time requirements checked in this struct:
+//   - both block descriptors are known at compile time
+//   - BlockSize == MWaves * NWaves * WaveSize
+//   - MPerBlock % (MPerWMMA * MRepeat) == 0 and NPerBlock % (NPerWMMA * NRepeat) == 0
+//   - MRepeat >= NRepeat (Run() first trims the MRepeat x NRepeat tile grid to a square)
+template <index_t BlockSize,
+          typename FloatA,
+          typename FloatB,
+          typename FloatAcc,
+          typename AK0MK1BlockDesc,
+          typename BK0NK1BlockDesc,
+          index_t MPerWMMA,
+          index_t NPerWMMA,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack>
+/* A: K0PerBlock x MPerBlock x K1
+ * B: K0PerBlock x NPerBlock x K1
+ * C: MRepeat x MWave x MSubGroup x NRepeat x NWave x NThreadPerSubGroup x MAccVgprs
+ * KPACK == WMMA_K = 16
+ */
+struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO
+{
+    static constexpr auto I0    = Number<0>{};
+    static constexpr auto I1    = Number<1>{};
+    static constexpr auto I2    = Number<2>{};
+    static constexpr auto I3    = Number<3>{};
+    static constexpr auto I4    = Number<4>{};
+    static constexpr auto WmmaK = Number<16>{}; // K extent consumed per WMMA step in Run()
+
+    using ThisThreadBlock = ThisThreadBlock<BlockSize>;
+
+    // Hardcode of WaveSize, since current HIP Runtime(5.4.0-10984) could not return correct one.
+    static constexpr index_t WaveSize = 32;
+
+    // Block tile extents taken from the K0 x M x K1 / K0 x N x K1 block descriptors.
+    // Note: KPerBlock is derived from the B descriptor only.
+    static constexpr index_t MPerBlock = AK0MK1BlockDesc{}.GetLength(I1);
+    static constexpr index_t NPerBlock = BK0NK1BlockDesc{}.GetLength(I1);
+    static constexpr index_t KPerBlock =
+        BK0NK1BlockDesc{}.GetLength(I0) * BK0NK1BlockDesc{}.GetLength(I2);
+
+    static constexpr index_t A_K0 = AK0MK1BlockDesc{}.GetLength(I0);
+    static constexpr index_t B_K0 = BK0NK1BlockDesc{}.GetLength(I0);
+    static constexpr index_t A_K1 = AK0MK1BlockDesc{}.GetLength(I2);
+    static constexpr index_t B_K1 = BK0NK1BlockDesc{}.GetLength(I2);
+
+    static constexpr auto wmma_gemm =
+        WmmaGemm<FloatA, FloatB, FloatAcc, MPerWMMA, NPerWMMA, KPack>{};
+
+    // Number of waves along M / N inside one block tile.
+    static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWMMA);
+    static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWMMA);
+
+    // Per-thread accumulator: one vector of GetRegSizePerWmma() elements per (MRepeat, NRepeat)
+    // tile.
+    StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
+                              FloatAcc,
+                              MRepeat * NRepeat,
+                              wmma_gemm.GetRegSizePerWmma(),
+                              true>
+        c_thread_buf_;
+
+    __host__ __device__ constexpr auto& GetCThreadBuffer() { return c_thread_buf_; }
+
+    // Splits the flat thread id into a 3-component index ordered as (MWaves, NWaves, WaveSize),
+    // i.e. [wave index along M, wave index along N, lane within the wave].
+    __device__ static auto GetWaveIdx()
+    {
+        const index_t thread_id = ThisThreadBlock::GetThreadId();
+
+        constexpr auto threadid_to_wave_idx_adaptor = make_single_stage_tensor_adaptor(
+            make_tuple(make_merge_transform(make_tuple(MWaves, NWaves, WaveSize))),
+            make_tuple(Sequence<0, 1, 2>{}),
+            make_tuple(Sequence<0>{}));
+
+        return threadid_to_wave_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id));
+    }
+
+    // Origin of this thread's reads in a_block_desc_k0_m0_m1_m2_k1; used to construct
+    // a_thread_copy_.
+    __device__ static auto CalculateAThreadOriginDataIndex()
+    {
+        const auto wave_idx = GetWaveIdx();
+
+        const auto waveId_m = wave_idx[I0];
+
+        const auto WMMA_a_idx = wmma_gemm.CalculateAThreadOriginDataIndex();
+        //  |KRepeat   |MRepeat|MWave      |MLane       |KPack
+        return make_tuple(0, 0, waveId_m, WMMA_a_idx, 0);
+    }
+
+    // Origin of this thread's reads in b_block_desc_k0_n0_n1_n2_k1; used to construct
+    // b_thread_copy_.
+    __device__ static auto CalculateBThreadOriginDataIndex()
+    {
+        const auto wave_idx = GetWaveIdx();
+
+        const auto waveId_n = wave_idx[I1];
+
+        const auto WMMA_b_idx = wmma_gemm.CalculateBThreadOriginDataIndex();
+        //  |KRepeat   |NRepeat|Nwave      |NLane       |KPack
+        return make_tuple(0, 0, waveId_n, WMMA_b_idx, 0);
+    }
+
+    // Returns the (m, n) coordinate, inside the block tile, of the start of this thread's C data
+    // for repeat tile (m0, n0). M is composed as (MRepeat, MWaves, MPerWMMA), N as
+    // (NRepeat, NWaves, NPerWMMA).
+    template <index_t m0, index_t n0>
+    __device__ static auto CalculateCThreadOriginDataIndex(Number<m0>, Number<n0>)
+    {
+        const auto wave_idx = GetWaveIdx();
+
+        const auto waveId_m = wave_idx[I0];
+        const auto waveId_n = wave_idx[I1];
+
+        const auto blk_idx = wmma_gemm.GetBeginOfThreadBlk();
+
+        constexpr auto mrepeat_mwave_mperWMMA_to_m_adaptor = make_single_stage_tensor_adaptor(
+            make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, MPerWMMA))),
+            make_tuple(Sequence<0>{}),
+            make_tuple(Sequence<0, 1, 2>{}));
+
+        constexpr auto nrepeat_nwave_nperWMMA_to_n_adaptor = make_single_stage_tensor_adaptor(
+            make_tuple(make_unmerge_transform(make_tuple(NRepeat, NWaves, NPerWMMA))),
+            make_tuple(Sequence<0>{}),
+            make_tuple(Sequence<0, 1, 2>{}));
+
+        const index_t c_thread_m = mrepeat_mwave_mperWMMA_to_m_adaptor.CalculateBottomIndex(
+            make_tuple(m0, waveId_m, blk_idx[I0]))[I0];
+        const index_t c_thread_n = nrepeat_nwave_nperWMMA_to_n_adaptor.CalculateBottomIndex(
+            make_tuple(n0, waveId_n, blk_idx[I1]))[I0];
+
+        return make_tuple(c_thread_m, c_thread_n);
+    }
+
+    // The constructor only validates the compile-time configuration.
+    __host__ __device__ BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO()
+    {
+        static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() &&
+                          BK0NK1BlockDesc::IsKnownAtCompileTime(),
+                      "wrong! Desc should be known at compile-time");
+
+        static_assert(ThisThreadBlock::GetNumOfThread() == MWaves * NWaves * WaveSize,
+                      "ThisThreadBlock::GetNumOfThread() != MWaves * NWaves * WaveSize\n");
+
+        static_assert(MPerBlock % (MPerWMMA * MRepeat) == 0 &&
+                          NPerBlock % (NPerWMMA * NRepeat) == 0,
+                      "wrong!");
+    }
+
+    // Thread level, register descriptor. Vector-write.
+    // Packed descriptor with lengths
+    // (MRepeat, 1, MSubGroup, NRepeat, 1, NThreadPerSubGroup, MAccVgprs).
+    __host__ __device__ static constexpr auto
+    GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs()
+    {
+        constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens =
+            wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths();
+
+        constexpr auto MSubGroup          = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0];
+        constexpr auto NThreadPerSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1];
+        constexpr auto MAccVgprs          = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2];
+
+        return make_naive_tensor_descriptor_packed(
+            //        |MRepeat           |MWave |MSubGroup |NRepeat           |NWave
+            //        |NThreadPerSubGroup |MAccVgprs
+            make_tuple(Number<MRepeat>{},
+                       I1,
+                       MSubGroup,
+                       Number<NRepeat>{},
+                       I1,
+                       NThreadPerSubGroup,
+                       MAccVgprs));
+    }
+
+    // Re-views an M x N grid descriptor: M is unmerged into
+    // (M / (MWaves * MPerWMMA), MWaves, MPerWMMA) and N into
+    // (N / (NWaves * NPerWMMA), NWaves, NPerWMMA); the result is then handed to wmma_gemm, which
+    // builds the final descriptor.
+    template <typename CGridDesc_M_N>
+    __host__ __device__ static constexpr auto
+    MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(
+        const CGridDesc_M_N& c_grid_desc_m_n)
+    {
+        const auto M = c_grid_desc_m_n.GetLength(I0);
+        const auto N = c_grid_desc_m_n.GetLength(I1);
+
+        const auto c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma =
+            transform_tensor_descriptor(
+                c_grid_desc_m_n,
+                make_tuple(
+                    make_unmerge_transform(make_tuple(M / (MWaves * MPerWMMA), MWaves, MPerWMMA)),
+                    make_unmerge_transform(make_tuple(N / (NWaves * NPerWMMA), NWaves, NPerWMMA))),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{}));
+
+        return wmma_gemm
+            .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(
+                c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma);
+    }
+
+    // Provide dimension size.
+    // Block-level counterpart of the grid descriptor above, built from a packed
+    // (MRepeat, MWaves, MPerWMMA, NRepeat, NWaves, NPerWMMA) descriptor.
+    __host__ __device__ static constexpr auto
+    GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs()
+    {
+        constexpr auto c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma =
+            make_naive_tensor_descriptor_packed(make_tuple(Number<MRepeat>{},
+                                                           Number<MWaves>{},
+                                                           Number<MPerWMMA>{},
+                                                           Number<NRepeat>{},
+                                                           Number<NWaves>{},
+                                                           Number<NPerWMMA>{}));
+
+        return wmma_gemm
+            .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(
+                c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma);
+    }
+
+    // Views the K0 x M x K1 A block descriptor as K0 x MRepeat x MWaves x MPerWMMA x K1.
+    __host__ __device__ static constexpr auto MakeABlockDescriptor_K0_M0_M1_M2_K1()
+    {
+        return transform_tensor_descriptor(
+            AK0MK1BlockDesc{},
+            make_tuple(make_pass_through_transform(Number<A_K0>{}),
+                       make_unmerge_transform(
+                           make_tuple(Number<MRepeat>{}, Number<MWaves>{}, Number<MPerWMMA>{})),
+                       make_pass_through_transform(Number<A_K1>{})),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{}));
+    }
+
+    // Views the K0 x N x K1 B block descriptor as K0 x NRepeat x NWaves x NPerWMMA x K1.
+    __host__ __device__ static constexpr auto MakeBBlockDescriptor_K0_N0_N1_N2_K1()
+    {
+        return transform_tensor_descriptor(
+            BK0NK1BlockDesc{},
+            make_tuple(make_pass_through_transform(Number<B_K0>{}),
+                       make_unmerge_transform(
+                           make_tuple(Number<NRepeat>{}, Number<NWaves>{}, Number<NPerWMMA>{})),
+                       make_pass_through_transform(Number<B_K1>{})),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{}));
+    }
+
+    // M0_M1_M2 = MRepeat_MWave_MPerWmma, N0_N1_N2 = NRepeat_NWave_NPerWmma
+    static constexpr auto a_block_desc_k0_m0_m1_m2_k1 = MakeABlockDescriptor_K0_M0_M1_M2_K1();
+    static constexpr auto b_block_desc_k0_n0_n1_n2_k1 = MakeBBlockDescriptor_K0_N0_N1_N2_K1();
+
+    // Accumulates A * B of one block tile into c_thread_buf.
+    //   a_block_buf / b_block_buf : block-level A / B buffers, addressed through
+    //                               a_block_desc_k0_m0_m1_m2_k1 / b_block_desc_k0_n0_n1_n2_k1
+    //   c_thread_buf              : per-thread accumulator, indexed through c_thread_desc_
+    //
+    // Schedule per WmmaK-deep K-slice (RepeatDiff = MRepeat - NRepeat):
+    //   Stage 1: A rows [0, RepeatDiff) are multiplied with every B column; each of those rows is
+    //            then refilled with its next K-slice, if there is one.
+    //   Stage 2: the remaining NRepeat x NRepeat square is walked along its diagonal. Step i
+    //            issues A row (i + RepeatDiff) against B columns [i, NRepeat), refills that A
+    //            row, issues A rows (i + RepeatDiff, MRepeat) against B column i, then refills
+    //            that B column.
+    // The stages are software-pipelined: inside the K loop, Stage 2 of the previous slice (with
+    // the refills for the current slice) is followed by Stage 1 of the current slice. Stage 2 of
+    // the final slice is peeled off after the loop because nothing is left to load.
+    template <typename ABlockBuffer, typename BBlockBuffer, typename CThreadBuffer>
+    __device__ void Run(const ABlockBuffer& a_block_buf,
+                        const BBlockBuffer& b_block_buf,
+                        CThreadBuffer& c_thread_buf) const
+    {
+        // Stage 1 peels off (MRepeat - NRepeat) rows; a negative count makes no sense here.
+        static_assert(MRepeat >= NRepeat, "wrong! FIFO pipeline requires MRepeat >= NRepeat");
+
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatA>(
+            a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatB>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        constexpr auto RepeatDiff = MRepeat - NRepeat;
+        // Read all Mrepeat, Nrepeat (first K-slice)
+        static_for<0, NRepeat, 1>{}([&](auto iN) {
+            b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1,
+                               make_tuple(I0, Number<iN>{}, I0, I0, I0),
+                               b_block_buf,
+                               b_thread_desc_,
+                               make_tuple(I0, Number<iN>{}, I0, I0, I0),
+                               b_thread_buf);
+        });
+
+        static_for<0, MRepeat, 1>{}([&](auto iM) {
+            a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1,
+                               make_tuple(I0, Number<iM>{}, I0, I0, I0),
+                               a_block_buf,
+                               a_thread_desc_,
+                               make_tuple(I0, Number<iM>{}, I0, I0, I0),
+                               a_thread_buf);
+        });
+
+        // Stage 1 (first K-slice): Cut the Repeat Rectangle to a Square, requires
+        // MRepeat >= NRepeat
+        static_for<0, RepeatDiff, 1>{}([&](auto iCut) {
+            static_for<0, NRepeat, 1>{}([&](auto iN) {
+                vector_type<FloatA, WmmaK> a_thread_vec;
+                vector_type<FloatB, WmmaK> b_thread_vec;
+
+                static_for<0, WmmaK, 1>{}([&](auto iK) {
+                    a_thread_vec.template AsType<FloatA>()(iK) =
+                        a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                            make_tuple(iK / A_K1, iCut, 0, 0, iK % A_K1))>{}];
+                    b_thread_vec.template AsType<FloatB>()(iK) =
+                        b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                            make_tuple(iK / B_K1, iN, 0, 0, iK % B_K1))>{}];
+                });
+                using wmma_input_type_a = typename vector_type<FloatA, WmmaK>::type;
+                using wmma_input_type_b = typename vector_type<FloatB, WmmaK>::type;
+
+                constexpr index_t c_offset =
+                    c_thread_desc_.CalculateOffset(make_tuple(iCut, iN, 0));
+                // s_nop();
+                wmma_gemm.Run(a_thread_vec.template AsType<wmma_input_type_a>()(Number<0>{}),
+                              b_thread_vec.template AsType<wmma_input_type_b>()(Number<0>{}),
+                              c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                // s_nop();
+            });
+            if constexpr(KPerBlock > WmmaK)
+            {
+                // Row iCut is fully consumed for this slice: read its next K-slice
+                a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1,
+                                   make_tuple(Number<WmmaK / A_K1>{}, Number<iCut>{}, I0, I0, I0),
+                                   a_block_buf,
+                                   a_thread_desc_,
+                                   make_tuple(I0, Number<iCut>{}, I0, I0, I0),
+                                   a_thread_buf);
+            }
+        });
+
+        // iWmmaK is the K offset of the slice being loaded during Stage 2 below.
+        static_for<WmmaK, KPerBlock, WmmaK>{}([&](auto iWmmaK) {
+            // Stage 2 (previous K-slice): Run FIFO fashion loopover in Square
+            static_for<0, NRepeat, 1>{}([&](auto WmmaInnerloop) {
+                // Row Repetition
+                static_for<WmmaInnerloop, NRepeat, 1>{}([&](auto iN) {
+                    vector_type<FloatA, WmmaK> a_thread_vec;
+                    vector_type<FloatB, WmmaK> b_thread_vec;
+
+                    static_for<0, WmmaK, 1>{}([&](auto iK) {
+                        a_thread_vec.template AsType<FloatA>()(iK) =
+                            a_thread_buf[Number<a_thread_desc_.CalculateOffset(make_tuple(
+                                iK / A_K1, WmmaInnerloop + RepeatDiff, 0, 0, iK % A_K1))>{}];
+                        b_thread_vec.template AsType<FloatB>()(iK) =
+                            b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                make_tuple(iK / B_K1, iN, 0, 0, iK % B_K1))>{}];
+                    });
+                    using wmma_input_type_a = typename vector_type<FloatA, WmmaK>::type;
+                    using wmma_input_type_b = typename vector_type<FloatB, WmmaK>::type;
+
+                    constexpr index_t c_offset = c_thread_desc_.CalculateOffset(
+                        make_tuple(WmmaInnerloop + RepeatDiff, iN, 0));
+                    // s_nop();
+                    wmma_gemm.Run(a_thread_vec.template AsType<wmma_input_type_a>()(Number<0>{}),
+                                  b_thread_vec.template AsType<wmma_input_type_b>()(Number<0>{}),
+                                  c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    // s_nop();
+                });
+
+                // Read Consumed Next inner loop A
+                a_thread_copy_.Run(
+                    a_block_desc_k0_m0_m1_m2_k1,
+                    make_tuple(
+                        Number<iWmmaK / A_K1>{}, Number<WmmaInnerloop + RepeatDiff>{}, I0, I0, I0),
+                    a_block_buf,
+                    a_thread_desc_,
+                    make_tuple(I0, Number<WmmaInnerloop + RepeatDiff>{}, I0, I0, I0),
+                    a_thread_buf);
+
+                // Col Repetition
+                static_for<WmmaInnerloop + 1 + RepeatDiff, MRepeat, 1>{}([&](auto iM) {
+                    vector_type<FloatA, WmmaK> a_thread_vec;
+                    vector_type<FloatB, WmmaK> b_thread_vec;
+
+                    static_for<0, WmmaK, 1>{}([&](auto iK) {
+                        a_thread_vec.template AsType<FloatA>()(iK) =
+                            a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                make_tuple(iK / A_K1, iM, 0, 0, iK % A_K1))>{}];
+                        b_thread_vec.template AsType<FloatB>()(iK) =
+                            b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                make_tuple(iK / B_K1, WmmaInnerloop, 0, 0, iK % B_K1))>{}];
+                    });
+                    using wmma_input_type_a = typename vector_type<FloatA, WmmaK>::type;
+                    using wmma_input_type_b = typename vector_type<FloatB, WmmaK>::type;
+
+                    constexpr index_t c_offset =
+                        c_thread_desc_.CalculateOffset(make_tuple(iM, WmmaInnerloop, 0));
+                    // s_nop();
+                    wmma_gemm.Run(a_thread_vec.template AsType<wmma_input_type_a>()(Number<0>{}),
+                                  b_thread_vec.template AsType<wmma_input_type_b>()(Number<0>{}),
+                                  c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    // s_nop();
+                });
+                // Read Consumed Next inner loop B
+                b_thread_copy_.Run(
+                    b_block_desc_k0_n0_n1_n2_k1,
+                    make_tuple(Number<iWmmaK / B_K1>{}, Number<WmmaInnerloop>{}, I0, I0, I0),
+                    b_block_buf,
+                    b_thread_desc_,
+                    make_tuple(I0, Number<WmmaInnerloop>{}, I0, I0, I0),
+                    b_thread_buf);
+            });
+
+            // Stage 1 (current K-slice): Cut the Repeat Rectangle to a Square, requires
+            // MRepeat >= NRepeat
+            static_for<0, RepeatDiff, 1>{}([&](auto iCut) {
+                static_for<0, NRepeat, 1>{}([&](auto iN) {
+                    vector_type<FloatA, WmmaK> a_thread_vec;
+                    vector_type<FloatB, WmmaK> b_thread_vec;
+
+                    static_for<0, WmmaK, 1>{}([&](auto iK) {
+                        a_thread_vec.template AsType<FloatA>()(iK) =
+                            a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                make_tuple(iK / A_K1, iCut, 0, 0, iK % A_K1))>{}];
+                        b_thread_vec.template AsType<FloatB>()(iK) =
+                            b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                make_tuple(iK / B_K1, iN, 0, 0, iK % B_K1))>{}];
+                    });
+                    using wmma_input_type_a = typename vector_type<FloatA, WmmaK>::type;
+                    using wmma_input_type_b = typename vector_type<FloatB, WmmaK>::type;
+
+                    constexpr index_t c_offset =
+                        c_thread_desc_.CalculateOffset(make_tuple(iCut, iN, 0));
+                    // s_nop();
+                    wmma_gemm.Run(a_thread_vec.template AsType<wmma_input_type_a>()(Number<0>{}),
+                                  b_thread_vec.template AsType<wmma_input_type_b>()(Number<0>{}),
+                                  c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    // s_nop();
+                });
+                // Refill row iCut only while another K-slice exists. On the last iteration
+                // iWmmaK + WmmaK == KPerBlock, so an unconditional read would start one slice
+                // past the block tile, and its data would never be consumed.
+                if constexpr((iWmmaK + WmmaK) < KPerBlock)
+                {
+                    a_thread_copy_.Run(
+                        a_block_desc_k0_m0_m1_m2_k1,
+                        make_tuple(Number<(iWmmaK + WmmaK) / A_K1>{}, Number<iCut>{}, I0, I0, I0),
+                        a_block_buf,
+                        a_thread_desc_,
+                        make_tuple(I0, Number<iCut>{}, I0, I0, I0),
+                        a_thread_buf);
+                }
+            });
+        });
+
+        // Stage 2 (last K-slice): Run FIFO fashion loopover in Square, nothing left to read
+        static_for<0, NRepeat, 1>{}([&](auto WmmaInnerloop) {
+            // Row Repetition
+            static_for<WmmaInnerloop, NRepeat, 1>{}([&](auto iN) {
+                vector_type<FloatA, WmmaK> a_thread_vec;
+                vector_type<FloatB, WmmaK> b_thread_vec;
+
+                static_for<0, WmmaK, 1>{}([&](auto iK) {
+                    a_thread_vec.template AsType<FloatA>()(iK) =
+                        a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                            make_tuple(iK / A_K1, WmmaInnerloop + RepeatDiff, 0, 0, iK % A_K1))>{}];
+                    b_thread_vec.template AsType<FloatB>()(iK) =
+                        b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                            make_tuple(iK / B_K1, iN, 0, 0, iK % B_K1))>{}];
+                });
+                using wmma_input_type_a = typename vector_type<FloatA, WmmaK>::type;
+                using wmma_input_type_b = typename vector_type<FloatB, WmmaK>::type;
+
+                constexpr index_t c_offset =
+                    c_thread_desc_.CalculateOffset(make_tuple(WmmaInnerloop + RepeatDiff, iN, 0));
+                // s_nop();
+                wmma_gemm.Run(a_thread_vec.template AsType<wmma_input_type_a>()(Number<0>{}),
+                              b_thread_vec.template AsType<wmma_input_type_b>()(Number<0>{}),
+                              c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                // s_nop();
+            });
+
+            // Col Repetition
+            static_for<WmmaInnerloop + 1 + RepeatDiff, MRepeat, 1>{}([&](auto iM) {
+                vector_type<FloatA, WmmaK> a_thread_vec;
+                vector_type<FloatB, WmmaK> b_thread_vec;
+
+                static_for<0, WmmaK, 1>{}([&](auto iK) {
+                    a_thread_vec.template AsType<FloatA>()(iK) =
+                        a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                            make_tuple(iK / A_K1, iM, 0, 0, iK % A_K1))>{}];
+                    b_thread_vec.template AsType<FloatB>()(iK) =
+                        b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                            make_tuple(iK / B_K1, WmmaInnerloop, 0, 0, iK % B_K1))>{}];
+                });
+                using wmma_input_type_a = typename vector_type<FloatA, WmmaK>::type;
+                using wmma_input_type_b = typename vector_type<FloatB, WmmaK>::type;
+
+                constexpr index_t c_offset =
+                    c_thread_desc_.CalculateOffset(make_tuple(iM, WmmaInnerloop, 0));
+                // s_nop();
+                wmma_gemm.Run(a_thread_vec.template AsType<wmma_input_type_a>()(Number<0>{}),
+                              b_thread_vec.template AsType<wmma_input_type_b>()(Number<0>{}),
+                              c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                // s_nop();
+            });
+        });
+    }
+
+    protected:
+    // Per-thread A slice, packed lengths [WmmaK / A_K1, MRepeat, 1, 1, A_K1]
+    static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<WmmaK / A_K1>{}, Number<MRepeat>{}, I1, I1, Number<A_K1>{}));
+
+    // Per-thread B slice, packed lengths [WmmaK / B_K1, NRepeat, 1, 1, B_K1]
+    static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<WmmaK / B_K1>{}, Number<NRepeat>{}, I1, I1, Number<B_K1>{}));
+
+    // C[M, N, NumRegWMMA]
+    static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, wmma_gemm.GetRegSizePerWmma()));
+
+    // Each Run() of these copies moves one [WmmaK / K1, 1, 1, 1, K1] slice, i.e. one
+    // row/column of one K-slice, from the block buffer into the thread buffer.
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatA,
+                                                         FloatA,
+                                                         decltype(a_block_desc_k0_m0_m1_m2_k1),
+                                                         decltype(a_thread_desc_),
+                                                         Sequence<WmmaK / A_K1, 1, 1, 1, A_K1>,
+                                                         Sequence<0, 1, 2, 3, 4>,
+                                                         4,
+                                                         A_K1,
+                                                         A_K1>;
+
+    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatB,
+                                                         FloatB,
+                                                         decltype(b_block_desc_k0_n0_n1_n2_k1),
+                                                         decltype(b_thread_desc_),
+                                                         Sequence<WmmaK / B_K1, 1, 1, 1, B_K1>,
+                                                         Sequence<0, 1, 2, 3, 4>,
+                                                         4,
+                                                         B_K1,
+                                                         B_K1>;
+
+    AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex()};
+    BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()};
+};
+
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_base.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_base.hpp
@@ -3,7 +3,6 @@

 #pragma once

-#include <cmath>
 #include <string>
 #include <sstream>


--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_layernorm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_layernorm.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <array>
+#include "device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// Pure-virtual interface of a fused GEMM + multiple-D elementwise + layernorm device operation:
+//   input : A[M, K]
+//   input : B[N, K]
+//   input : D0[M, N], D1[M, N], ...
+//   output : E[M, N]   (note: MakeArgumentPointer below takes no pointer for E)
+//   output : H[M, N]
+//   C = a_op(A) * b_op(B)
+//   E = cde_op(C, D0, D1, ...)
+//   H = layernorm(E)
+// Assume:
+//   D0, D1, ... and E have the same layout
+//   Calculate mean & variance along N dimension in layernorm(E)
+template <typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename HLayout,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename GammaDataType,
+          typename BetaDataType,
+          typename HDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation,
+          typename HElementwiseOperation>
+struct DeviceGemmMultipleDLayernorm : public BaseOperator
+{
+    static constexpr index_t NumDTensor = DsDataType::Size(); // sizes the p_ds / StrideDs arrays
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b,
+                        std::array<const void*, NumDTensor> p_ds, // one pointer per D tensor
+                        const void* p_gamma, // presumably the layernorm scale - TODO confirm
+                        const void* p_beta,  // presumably the layernorm shift - TODO confirm
+                        void* p_h,           // only non-const pointer in this signature
+                        index_t MRaw,
+                        index_t NRaw,
+                        index_t KRaw,
+                        index_t StrideA,
+                        index_t StrideB,
+                        std::array<index_t, NumDTensor> StrideDs, // one stride per D tensor
+                        index_t StrideH,
+                        double epsilon, // presumably the layernorm epsilon - TODO confirm
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op,
+                        HElementwiseOperation h_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+}; // struct DeviceGemmMultipleDLayernorm
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_permute.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_permute.hpp
@@ -4,7 +4,6 @@
 #pragma once

 #include <array>
-#include <cmath>
 #include <memory>
 #include <type_traits>


--- a/include/ck/tensor_operation/gpu/device/device_reduce.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce.hpp
@@ -13,10 +13,16 @@ namespace ck {
 namespace tensor_operation {
 namespace device {

-template <index_t Rank,
+template <typename InDataType,
+          typename AccDataType,
+          typename OutDataType,
+          index_t Rank,
          index_t NumReduceDim,
+          typename ReduceOperation,
          typename InElementwiseOperation,
-          typename AccElementwiseOperation>
+          typename AccElementwiseOperation,
+          bool PropagateNan,
+          bool OutputIndex>
 struct DeviceReduce : public BaseOperator
 {
    static constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim;
@@ -39,12 +45,26 @@ struct DeviceReduce : public BaseOperator
    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };

-template <index_t Rank,
+template <typename InDataType,
+          typename AccDataType,
+          typename OutDataType,
+          index_t Rank,
          index_t NumReduceDim,
+          typename ReduceOperation,
          typename InElementwiseOperation,
-          typename AccElementwiseOperation>
-using DeviceReducePtr = std::unique_ptr<
-    DeviceReduce<Rank, NumReduceDim, InElementwiseOperation, AccElementwiseOperation>>;
+          typename AccElementwiseOperation,
+          bool PropagateNan,
+          bool OutputIndex>
+using DeviceReducePtr = std::unique_ptr<DeviceReduce<InDataType,
+                                                     AccDataType,
+                                                     OutDataType,
+                                                     Rank,
+                                                     NumReduceDim,
+                                                     ReduceOperation,
+                                                     InElementwiseOperation,
+                                                     AccElementwiseOperation,
+                                                     PropagateNan,
+                                                     OutputIndex>>;

 } // namespace device
 } // namespace tensor_operation

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp
@@ -579,6 +579,7 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
                                         BatchStrideD1s,
                                         BatchStrideE1}
        {
+#if DEBUG_LOG
            std::cout << "a0_grid_desc_m_k_{" << a0_grid_desc_m_k_.GetLength(I0) << ", "
                      << a0_grid_desc_m_k_.GetLength(I1) << "}" << std::endl;
            std::cout << "b0_grid_desc_n_k_{" << b0_grid_desc_n_k_.GetLength(I0) << ", "
@@ -601,6 +602,7 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
                      << std::endl;
            std::cout << "e1_grid_desc_m_n_{" << e1_grid_desc_m_n_.GetLength(I0) << ", "
                      << e1_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
+#endif

            static_for<0, NumD0Tensor, 1>{}([&](auto i) {
                using D0Layout   = remove_cvref_t<tuple_element_t<i.value, D0sLayout>>;

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp
@@ -657,7 +657,7 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceO

        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
-#if 0
+#if DEBUG_LOG
            {
                std::cout << "arg.Batch_ = " << arg.Batch_ << std::endl;

@@ -674,8 +674,8 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceO
                std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
                          << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;

-                std::cout << "arg.reduce_grid_desc_m_{ " << arg.reduce_grid_desc_m_.GetLength(I0) << "}"
-                          << std::endl;
+                std::cout << "arg.reduce_grid_desc_m_{ " << arg.reduce_grid_desc_m_.GetLength(I0)
+                          << "}" << std::endl;
            }
 #endif