Merge remote-tracking branch 'origin/develop' into cpu_avx2

b79df771 · carlushuang · 05d38218 · 63914743 · b79df771 · b79df771
Commit b79df771 authored Jul 12, 2022 by carlushuang
20 changed files
--- a/example/19_binary_elementwise/elementwise_add_4d.cpp
+++ b/example/19_binary_elementwise/elementwise_add_4d.cpp
-/*******************************************************************************
+// SPDX-License-Identifier: MIT
- *
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
- * MIT License
- *
- * Copyright (c) 2020 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
 #include <iostream>
 #include <cstdlib>
-#include "check_err.hpp"
-#include "config.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "device_tensor.hpp"
+#include "ck/ck.hpp"
-#include "binary_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
-#include "device_binary_elementwise.hpp"
+#include "ck/tensor_operation/gpu/device/device_binary_elementwise.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
 using F16 = ck::half_t;
 using F32 = float;
@@ -42,8 +20,7 @@ using ABDataType             = F16;
 using CDataType              = F16;
 using EltwiseComputeDataType = F32;
-using Add = ck::tensor_operation::binary_element_wise::
+using Add = ck::tensor_operation::element_wise::Add;
-    Add<EltwiseComputeDataType, EltwiseComputeDataType, EltwiseComputeDataType>;
 using DeviceElementwiseAddInstance =
    ck::tensor_operation::device::DeviceBinaryElementwise<ABDataType,
@@ -104,16 +81,22 @@ int main()
    a_device_buf.ToDevice(a.mData.data());
    b_device_buf.ToDevice(b.mData.data());
+    std::array<const void*, 2> input = {a_device_buf.GetDeviceBuffer(),
+                                        b_device_buf.GetDeviceBuffer()};
+    std::array<void*, 1> output      = {c_device_buf.GetDeviceBuffer()};
+    std::vector<ck::index_t> a_strides{a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end()};
+    std::vector<ck::index_t> b_strides{b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end()};
+    std::vector<ck::index_t> c_strides{c.mDesc.GetStrides().begin(), c.mDesc.GetStrides().end()};
    auto broadcastAdd = DeviceElementwiseAddInstance{};
-    auto argument     = broadcastAdd.MakeArgumentPointer(
+    auto argument =
-        a_device_buf.GetDeviceBuffer(),
+        broadcastAdd.MakeArgumentPointer(input,
-        b_device_buf.GetDeviceBuffer(),
+                                         output,
-        c_device_buf.GetDeviceBuffer(),
+                                         std::vector<ck::index_t>{nchw.begin(), nchw.end()},
-        std::vector<ck::index_t>{nchw.begin(), nchw.end()},
+                                         {{a_strides}, b_strides},
-        std::vector<ck::index_t>{a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end()},
+                                         {c_strides},
-        std::vector<ck::index_t>{b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end()},
+                                         Add{});
-        std::vector<ck::index_t>{c.mDesc.GetStrides().begin(), c.mDesc.GetStrides().end()},
-        Add{});
    if(!broadcastAdd.IsSupportedArgument(argument.get()))
    {

--- a/example/20_convnd_bwd_weight_xdl/CMakeLists.txt
+++ b/example/20_convnd_bwd_weight_xdl/CMakeLists.txt
 add_example_executable(example_convnd_bwd_weight_xdl convnd_bwd_weight_xdl.cpp)
-target_link_libraries(example_convnd_bwd_weight_xdl PRIVATE conv_util)
+add_example_executable(example_convnd_bwd_weight_xdl_bf16_splitk convnd_bwd_weight_xdl_bf16_splitk.cpp)
\ No newline at end of file
+target_link_libraries(example_convnd_bwd_weight_xdl PRIVATE conv_util)
+target_link_libraries(example_convnd_bwd_weight_xdl_bf16_splitk PRIVATE conv_util)
\ No newline at end of file
--- a/example/20_convnd_bwd_weight_xdl/convnd_bwd_weight_xdl.cpp
+++ b/example/20_convnd_bwd_weight_xdl/convnd_bwd_weight_xdl.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
 #include <numeric>
 #include <initializer_list>
 #include <cstdlib>
-#include <stdlib.h>
-#include <half.hpp>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "check_err.hpp"
+#include "ck/tensor_operation/gpu/device/device_convnd_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
-#include "conv_util.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "config.hpp"
-#include "print.hpp"
+#include "ck/library/utility/check_err.hpp"
-#include "device.hpp"
+#include "ck/library/utility/conv_util.hpp"
-#include "host_tensor.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
-#include "host_tensor_generator.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
-#include "device_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
-#include "tensor_layout.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp"
-#include "element_wise_operation.hpp"
-#include "device_convnd_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
-#include "reference_conv_backward_weight.hpp"
 using InDataType  = ck::half_t;
 using WeiDataType = ck::half_t;
@@ -297,52 +297,15 @@ int main(int argc, char* argv[])
                                  split_k);
    // alloc work space
-    size_t bwd_weight_workspace_size = conv->GetWorkSpaceSize(argument.get());
+    float ave_time = 0.f;
-    float ave_time                   = 0.f;
+    if(!conv->IsSupportedArgument(argument.get()))
-    if(std::is_same<InDataType, ck::bhalf_t>::value && split_k > 1)
-    {
-        DeviceMem wei_work_space_device_buf(bwd_weight_workspace_size);
-        wei_work_space_device_buf.SetZero();
-        argument = conv->MakeArgumentPointer(
-            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-            static_cast<AccDataType*>(wei_work_space_device_buf.GetDeviceBuffer()),
-            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-            params.N_,
-            params.K_,
-            params.C_,
-            params.input_spatial_lengths_,
-            params.filter_spatial_lengths_,
-            output_spatial_lengths,
-            params.conv_filter_strides_,
-            params.conv_filter_dilations_,
-            params.input_left_pads_,
-            params.input_right_pads_,
-            InElementOp{},
-            WeiElementOp{},
-            OutElementOp{},
-            split_k);
-        if(!conv->IsSupportedArgument(argument.get()))
-        {
-            std::cout << "wrong! device_conv with the specified compilation parameters does "
-                         "not support this Conv problem"
-                      << std::endl;
-            return 1;
-        }
-        ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
-    }
-    else
    {
-        if(!conv->IsSupportedArgument(argument.get()))
+        std::cout << "wrong! device_conv with the specified compilation parameters does "
-        {
+                     "not support this Conv problem"
-            std::cout << "wrong! device_conv with the specified compilation parameters does "
+                  << std::endl;
-                         "not support this Conv problem"
+        return 1;
-                      << std::endl;
-            return 1;
-        }
-        ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
    }
+    ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
    std::size_t flop = ck::utils::conv::get_flops(
        params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);

--- a/example/20_convnd_bwd_weight_xdl/convnd_bwd_weight_xdl_bf16_splitk.cpp
+++ b/example/20_convnd_bwd_weight_xdl/convnd_bwd_weight_xdl_bf16_splitk.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_convnd_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
+#include "ck/tensor_operation/gpu/device/device_unary_elementwise.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp"
+using InDataType  = ck::bhalf_t;
+using WeiDataType = ck::bhalf_t;
+using OutDataType = ck::bhalf_t;
+using AccDataType = float;
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+using UnaryTypeConvert = ck::tensor_operation::element_wise::UnaryTypeConvert<ck::bhalf_t, float>;
+using DeviceUnaryElementwiseTypeConvertInstance = ck::tensor_operation::device::
+    DeviceUnaryElementwise<AccDataType, WeiDataType, UnaryTypeConvert, 1, 4>;
+static constexpr auto ConvBwdWeightDefault =
+    ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default;
+using DeviceConvBwdWeightBasePtr =
+    ck::tensor_operation::device::DeviceConvBwdWeightPtr<InElementOp, WeiElementOp, OutElementOp>;
+// clang-format off
+template <ck::index_t NumDimSpatial>
+using DeviceConvndBwdWeightInstance_bf16_splitk = ck::tensor_operation::device::
+    DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
+        InDataType,                       // InDataType
+        AccDataType,                      // WeiDataType
+        OutDataType,                      // OutDataType
+        AccDataType,                      // AccDataType
+        InElementOp,                      // InElementwiseOperation
+        WeiElementOp,                     // WeiElementwiseOperation
+        OutElementOp,                     // OutElementwiseOperation
+        ConvBwdWeightDefault,             // ConvolutionBackwardWeightSpecialization
+        NumDimSpatial,                    // NumDimSpatial
+        256,                              // BlockSize
+        128,                              // MPerBlock
+        128,                              // NPerBlock
+        4,                                // K0PerBlock
+        8,                                // K1
+        32,                               // MPerXdl
+        32,                               // NPerXdl
+        2,                                // MXdlPerWave
+        2,                                // NXdlPerWave
+        S<1, 4, 16, 4>,                   // ABlockTransferThreadClusterLengths_K0_M_K1
+        S<0, 3, 1, 2>,                    // ABlockTransferThreadClusterArrangeOrder
+        S<0, 2, 1, 3>,                    // ABlockTransferSrcAccessOrder
+        2,                                // ABlockTransferSrcVectorDim
+        8,                                // ABlockTransferSrcScalarPerVector
+        2,                                // ABlockTransferDstScalarPerVector_K1
+        true,                             // ABlockLdsAddExtraM
+        S<1, 4, 16, 4>,                   // BBlockTransferThreadClusterLengths_K0_N_K1
+        S<0, 3, 1, 2>,                    // BBlockTransferThreadClusterArrangeOrder
+        S<0, 2, 1, 3>,                    // BBlockTransferSrcAccessOrder
+        2,                                // BBlockTransferSrcVectorDim
+        8,                                // BBlockTransferSrcScalarPerVector
+        2,                                // BBlockTransferDstScalarPerVector_K1
+        true,                             // BBlockLdsAddExtraN
+        1,                                // CShuffleMXdlPerWavePerShuffle
+        1,                                // CShuffleNXdlPerWavePerShuffle
+        S<1, 32, 1, 4>,                   // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+        4>;                               // CBlockTransferScalarPerVector_NWaveNPerXdl
+// clang-format on
+template <ck::index_t NumDimSpatial>
+using ReferenceConvBwdWeightInstance =
+    ck::tensor_operation::host::ReferenceConvBwdWeight<InDataType,
+                                                       WeiDataType,
+                                                       OutDataType,
+                                                       InElementOp,
+                                                       WeiElementOp,
+                                                       OutElementOp,
+                                                       NumDimSpatial>;
+template <typename HostTensorB, typename HostTensorA, typename Functor>
+void host_elementwise(HostTensorB& B,
+                      const HostTensorA& A,
+                      const std::vector<std::size_t>& shape,
+                      Functor functor)
+{
+    size_t tensor_size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>{});
+    std::cout << __LINE__ << ":" << tensor_size << ", " << A.mData[0] << std::endl;
+    for(std::size_t n = 0; n < tensor_size; ++n)
+    {
+        B.mData[n] = functor(A.mData[n]);
+    }
+}
+void print_use_msg()
+{
+    std::cout << "arg1: verification (0=no, 1=yes)\n"
+              << "arg2: initialization (0=no init, 1=random value, 2= init to 1 )\n"
+              << "arg3: time kernel (0=n0, 1=yes)\n"
+              << "arg4: is show log (0=no, 1=yes)\n"
+              << "arg5: split-k : in this example split-k must be larger than 1\n"
+              << "arg6: N spatial dimensions (default 2)\n"
+              << "Following arguments (depending on number of spatial dims):\n"
+              << " N, K, C, \n"
+              << " <filter spatial dimensions>, (ie Y, X for 2D)\n"
+              << " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
+              << " <strides>, (ie Sy, Sx for 2D)\n"
+              << " <dilations>, (ie Dy, Dx for 2D)\n"
+              << " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
+              << " <right padding>, (ie RightPy, RightPx for 2D)\n"
+              << std::endl;
+}
+ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[])
+{
+    // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
+    ck::utils::conv::ConvParams params;
+    int arg_idx = 7;
+    params.num_dim_spatial_ = num_dim_spatial;
+    params.N_               = std::stoi(argv[arg_idx++]);
+    params.K_               = std::stoi(argv[arg_idx++]);
+    params.C_               = std::stoi(argv[arg_idx++]);
+    params.filter_spatial_lengths_.resize(num_dim_spatial);
+    for(int i = 0; i < num_dim_spatial; ++i)
+    {
+        params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
+    }
+    params.input_spatial_lengths_.resize(num_dim_spatial);
+    for(int i = 0; i < num_dim_spatial; ++i)
+    {
+        params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
+    }
+    params.conv_filter_strides_.resize(num_dim_spatial);
+    for(int i = 0; i < num_dim_spatial; ++i)
+    {
+        params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
+    }
+    params.conv_filter_dilations_.resize(num_dim_spatial);
+    for(int i = 0; i < num_dim_spatial; ++i)
+    {
+        params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
+    }
+    params.input_left_pads_.resize(num_dim_spatial);
+    for(int i = 0; i < num_dim_spatial; ++i)
+    {
+        params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
+    }
+    params.input_right_pads_.resize(num_dim_spatial);
+    for(int i = 0; i < num_dim_spatial; ++i)
+    {
+        params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
+    }
+    return params;
+}
+DeviceConvBwdWeightBasePtr get_conv_instance(int num_dim_spatial)
+{
+    switch(num_dim_spatial)
+    {
+    case 3: {
+        return std::make_unique<DeviceConvndBwdWeightInstance_bf16_splitk<3>>();
+    }
+    case 2: {
+        return std::make_unique<DeviceConvndBwdWeightInstance_bf16_splitk<2>>();
+    }
+    case 1: {
+        return std::make_unique<DeviceConvndBwdWeightInstance_bf16_splitk<1>>();
+    }
+    default: {
+        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
+    }
+    }
+}
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+    int num_dim_spatial  = 2;
+    int do_log           = 0;
+    int split_k          = 2;
+    ck::utils::conv::ConvParams params;
+    params.C_ = 128;
+    if(argc == 6)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+        do_log          = std::stoi(argv[4]);
+        split_k         = std::stoi(argv[5]);
+    }
+    else if(argc > 6)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+        do_log          = std::stoi(argv[4]);
+        split_k         = std::stoi(argv[5]);
+        num_dim_spatial = std::stoi(argv[6]);
+        // check args number
+        int conv_args     = 3 + num_dim_spatial * 6;
+        int cmdline_nargs = conv_args + 7;
+        if(cmdline_nargs != argc)
+        {
+            print_use_msg();
+            exit(1);
+        }
+        params = parse_conv_params(num_dim_spatial, argv);
+    }
+    else if(argc != 1)
+    {
+        print_use_msg();
+        exit(1);
+    }
+    if(split_k <= 1)
+    {
+        print_use_msg();
+        exit(1);
+    }
+    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_),
+                                        static_cast<std::size_t>(params.C_)};
+    input_dims.insert(std::end(input_dims),
+                      std::begin(params.input_spatial_lengths_),
+                      std::end(params.input_spatial_lengths_));
+    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
+                                         static_cast<std::size_t>(params.C_)};
+    filter_dims.insert(std::end(filter_dims),
+                       std::begin(params.filter_spatial_lengths_),
+                       std::end(params.filter_spatial_lengths_));
+    const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
+    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
+                                         static_cast<std::size_t>(params.K_)};
+    output_dims.insert(std::end(output_dims),
+                       std::begin(output_spatial_lengths),
+                       std::end(output_spatial_lengths));
+    Tensor<InDataType> in_n_c_hi_wi(
+        ck::utils::conv::get_input_host_tensor_descriptor(input_dims, num_dim_spatial));
+    Tensor<WeiDataType> wei_k_c_y_x_host_result(
+        ck::utils::conv::get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
+    Tensor<WeiDataType> wei_k_c_y_x_device_result(
+        ck::utils::conv::get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
+    Tensor<OutDataType> out_n_k_ho_wo(
+        ck::utils::conv::get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
+    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
+    std::cout << "wei_k_c_y_x: " << wei_k_c_y_x_device_result.mDesc << std::endl;
+    std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo.mDesc << std::endl;
+    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
+    std::cout << "wei_k_c_y_x: " << wei_k_c_y_x_host_result.mDesc << std::endl;
+    std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo.mDesc << std::endl;
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        out_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
+        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
+        break;
+    default:
+        out_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1});
+        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{1});
+    }
+    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) *
+                             wei_k_c_y_x_device_result.mDesc.GetElementSpace());
+    DeviceMem out_device_buf(sizeof(OutDataType) * out_n_k_ho_wo.mDesc.GetElementSpace());
+    in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
+    out_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
+    // reset input to zero
+    wei_device_buf.SetZero();
+    // do GEMM
+    auto conv    = get_conv_instance(num_dim_spatial);
+    auto invoker = conv->MakeInvokerPointer();
+    auto argument =
+        conv->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+                                  static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
+                                  static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+                                  params.N_,
+                                  params.K_,
+                                  params.C_,
+                                  params.input_spatial_lengths_,
+                                  params.filter_spatial_lengths_,
+                                  output_spatial_lengths,
+                                  params.conv_filter_strides_,
+                                  params.conv_filter_dilations_,
+                                  params.input_left_pads_,
+                                  params.input_right_pads_,
+                                  InElementOp{},
+                                  WeiElementOp{},
+                                  OutElementOp{},
+                                  split_k);
+    // alloc work space
+    size_t bwd_weight_workspace_size = conv->GetWorkSpaceSize(argument.get());
+    if(bwd_weight_workspace_size <= 0)
+    {
+        print_use_msg();
+        exit(1);
+    }
+    float conv_ave_time = 0.f;
+    DeviceMem wei_work_space_device_buf(bwd_weight_workspace_size);
+    wei_work_space_device_buf.SetZero();
+    conv->SetWorkSpacePointer(argument.get(), wei_work_space_device_buf.GetDeviceBuffer());
+    if(!conv->IsSupportedArgument(argument.get()))
+    {
+        std::cout << "wrong! device_conv with the specified compilation parameters does "
+                     "not support this Conv problem"
+                  << std::endl;
+        return 1;
+    }
+    conv_ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
+    std::size_t flop = ck::utils::conv::get_flops(
+        params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
+    std::size_t num_btype = ck::utils::conv::get_btype<InDataType, WeiDataType, OutDataType>(
+        params.N_,
+        params.C_,
+        params.K_,
+        params.input_spatial_lengths_,
+        params.filter_spatial_lengths_,
+        output_spatial_lengths);
+    float tflops = static_cast<float>(flop) / 1.E9 / conv_ave_time;
+    float gb_per_sec = num_btype / 1.E6 / conv_ave_time;
+    std::cout << "Perf: conv: " << conv_ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+              << " GB/s" << std::endl;
+    if(do_verification)
+    {
+        auto verify_f = [&](const auto& ref_conv) {
+            auto ref_invoker = ref_conv.MakeInvoker();
+            auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi,
+                                                      wei_k_c_y_x_host_result,
+                                                      out_n_k_ho_wo,
+                                                      params.conv_filter_strides_,
+                                                      params.conv_filter_dilations_,
+                                                      params.input_left_pads_,
+                                                      params.input_right_pads_,
+                                                      InElementOp{},
+                                                      WeiElementOp{},
+                                                      OutElementOp{});
+            ref_invoker.Run(ref_argument);
+            wei_device_buf.FromDevice(wei_k_c_y_x_device_result.mData.data());
+            if(do_log)
+            {
+                LogRangeAsType<float>(std::cout << "out: ", out_n_k_ho_wo.mData, ",") << std::endl;
+                LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi.mData, ",") << std::endl;
+                LogRangeAsType<float>(
+                    std::cout << "wei_device(after): ", wei_k_c_y_x_device_result.mData, ",")
+                    << std::endl;
+                LogRangeAsType<float>(
+                    std::cout << "wei_host  : ", wei_k_c_y_x_host_result.mData, ",")
+                    << std::endl;
+            }
+            return ck::utils::check_err(wei_k_c_y_x_device_result.mData,
+                                        wei_k_c_y_x_host_result.mData)
+                       ? 0
+                       : 1;
+        };
+        switch(num_dim_spatial)
+        {
+        case 3: {
+            auto ref_conv = ReferenceConvBwdWeightInstance<3>();
+            verify_f(ref_conv);
+            break;
+        }
+        case 2: {
+            auto ref_conv = ReferenceConvBwdWeightInstance<2>();
+            verify_f(ref_conv);
+            break;
+        }
+        case 1: {
+            auto ref_conv = ReferenceConvBwdWeightInstance<1>();
+            verify_f(ref_conv);
+            break;
+        }
+        default: {
+            throw std::runtime_error("Unsupported number of spatial dimensions provided!");
+        }
+        }
+    }
+    return 0;
+}
--- a/example/21_gemm_layernorm/CMakeLists.txt
+++ b/example/21_gemm_layernorm/CMakeLists.txt
+add_example_executable(example_gemm_bias_relu_add_layernorm_xdl_fp16 gemm_bias_relu_add_layernorm_xdl_fp16.cpp)
 add_example_executable(example_gemm_layernorm_xdl_fp16 gemm_layernorm_xdl_fp16.cpp)
+add_example_executable(example_gemm_xdl_layernorm_single_kernel_fp16 gemm_xdl_layernorm_single_kernel_fp16.cpp)
--- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_bias_add_reduce_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using F16 = ck::half_t;
+using F32 = float;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+using ADataType                = F16;
+using BDataType                = F16;
+using CDataType                = F16;
+using BiasDataType             = F32;
+using D0DataType               = F16;
+using GemmAccDataType          = F32;
+using ReduceAccDataType        = F32;
+using ReduceDataType           = F32;
+using ReducePtrsGlobal         = ck::Tuple<ReduceDataType*, ReduceDataType*>;
+using GammaDataType            = F16;
+using BetaDataType             = F16;
+using LayerNormOutDataType     = F16;
+using NormalizeComputeDataType = F32;
+using ALayout = ck::tensor_layout::gemm::RowMajor;
+using BLayout = ck::tensor_layout::gemm::ColumnMajor;
+using CLayout = ck::tensor_layout::gemm::RowMajor;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AElementOp  = PassThrough;
+using BElementOp  = PassThrough;
+using CElementOp  = ck::tensor_operation::element_wise::Relu;
+using D0ElementOp = PassThrough;
+using ReduceSumOp = ck::reduce::Add;
+using ReduceOps   = ck::Tuple<ReduceSumOp, ReduceSumOp>;
+using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough;
+using UnaryDivElementOp     = ck::tensor_operation::element_wise::UnaryDivide;
+using UnarySquareElementOp  = ck::tensor_operation::element_wise::UnarySquare;
+using ReduceInElementOps    = ck::Tuple<UnaryIdenticElementOp, UnarySquareElementOp>;
+using ReduceOutElementOps   = ck::Tuple<UnaryDivElementOp, UnaryDivElementOp>;
+using ReduceGlobalMemOps =
+    ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicAdd,
+                                          ck::InMemoryDataOperationEnum::AtomicAdd>;
+static constexpr auto GemmSpecialization =
+    ck::tensor_operation::device::GemmSpecialization::Default;
+// clang-format off
+using DeviceGemmBiasAddReduceInstance = ck::tensor_operation::device::DeviceGemmBiasAddReduce_Xdl_CShuffle
+//######| ALayout| BLayout| CLayout|AData| BData| CData|C0Data|C1Data|  GemmAcc| CShuffle| ReduceAcc|       ReduceData|           A|           B|           C|          C1|    Reduce|     ReduceInEleOp|      ReduceAccEleOp|              Reduce|               GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|              CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
+//######|        |        |        | Type|  Type|  Type|  Type|  Type| DataType| DataType|  DataType|       Type Tuple| Elementwise| Elementwise| Elementwise| Elementwise| Operation|                  |                    |          MemoryData|     Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN| MXdlPerWave| NXdlPerWave|            _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths|     SrcDstScalarPerVector|        SrcDstScalarPerVector|
+//######|        |        |        |     |      |      |      |      |         |         |          |                 |   Operation|   Operation|   Operation|   Operation|          |                  |                    |           Operation|                   |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|            _NBlock_NPerBlock|      _NPerBlock| _MPerBlock_NPerBlock|                _NPerBlock|                   _MPerBlock|
+//######|        |        |        |     |      |      |      |      |         |         |          |                 |            |            |            |            |          |                  |                    |                    |                   |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                     |                          |                             |
+        <     Row,     Col,     Row,  F16,   F16,   F16,   F32,   F16,      F32,      F32,       F32, ReducePtrsGlobal,  AElementOp,  BElementOp,  CElementOp, D0ElementOp, ReduceOps,ReduceInElementOps, ReduceOutElementOps,  ReduceGlobalMemOps, GemmSpecialization,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>;
+// clang-format on
+using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                        BDataType,
+                                                                        CDataType,
+                                                                        GemmAccDataType,
+                                                                        AElementOp,
+                                                                        BElementOp,
+                                                                        PassThrough>;
+using NormalizeFunctor = ck::tensor_operation::element_wise::Normalize;
+// A:x, B:E[x], C:E[x^2], D:Gamma, E:Beta , F:y
+using DeviceNormalizeInstance =
+    ck::tensor_operation::device::Device5AryElementwise<CDataType,
+                                                        ReduceDataType,
+                                                        ReduceDataType,
+                                                        GammaDataType,
+                                                        BetaDataType,
+                                                        LayerNormOutDataType,
+                                                        NormalizeComputeDataType,
+                                                        NormalizeFunctor,
+                                                        2,
+                                                        8,
+                                                        8,  // scalarPerVector: gemm_out
+                                                        1,  // scalarPerVector: reduce_mean
+                                                        1,  // scalarPerVector: reduce_mean_square
+                                                        8,  // scalarPerVector: Gamma
+                                                        8,  // scalarPerVector: Beta
+                                                        8>; // scalarPerVector: LayerNorm_out
+auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
+    return HostTensorDescriptor(std::vector<std::size_t>({len}),
+                                std::vector<std::size_t>({stride}));
+};
+auto f_host_tensor_descriptor2d =
+    [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+        {
+            return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                        std::vector<std::size_t>({stride, 1}));
+        }
+        else
+        {
+            return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                        std::vector<std::size_t>({1, stride}));
+        }
+    };
+template <typename CDataType,
+          typename ReduceDataType,
+          typename AccDataType,
+          typename BiasDataType,
+          typename D0DataType,
+          typename A_functor,
+          typename B_functor,
+          typename C_functor,
+          typename C1_functor>
+void host_gemm_layernorm(Tensor<LayerNormOutDataType>& out_m_n,
+                         const Tensor<ADataType>& a_m_k,
+                         const Tensor<ADataType>& b_k_n,
+                         const Tensor<BiasDataType>& bias_n,
+                         const Tensor<D0DataType>& c1_m_n,
+                         const Tensor<GammaDataType>& gamma_n,
+                         const Tensor<GammaDataType>& beta_n,
+                         A_functor a_element_op,
+                         B_functor b_element_op,
+                         C_functor c_element_op,
+                         C1_functor c1_element_op,
+                         int M,
+                         int N)
+{
+    int StrideC = N;
+    Tensor<CDataType> c_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
+    Tensor<ReduceDataType> mean_m(f_host_tensor_descriptor1d(M, 1));
+    Tensor<ReduceDataType> meanSquare_m(f_host_tensor_descriptor1d(M, 1));
+    auto averageOpInst = UnaryDivElementOp{N};
+    auto ref_gemm    = ReferenceGemmInstance{};
+    auto ref_invoker = ref_gemm.MakeInvoker();
+    auto ref_argument =
+        ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
+    ref_invoker.Run(ref_argument);
+    // c = activation(c + bias) + c1_functor(c1)
+    for(int m = 0; m < M; ++m)
+        for(int n = 0; n < N; ++n)
+        {
+            AccDataType acc = ck::type_convert<AccDataType>(c_m_n(m, n)) +
+                              ck::type_convert<AccDataType>(bias_n(n));
+            AccDataType c1 = ck::type_convert<AccDataType>(c1_m_n(m, n));
+            c_element_op(acc, acc);
+            c1_element_op(c1, c1);
+            acc += c1;
+            c_m_n(m, n) = ck::type_convert<CDataType>(acc);
+        }
+    // reduce_mean and reduce_square_mean
+    auto reduceSumOpInst = ReduceSumOp{};
+    for(int m = 0; m < M; ++m)
+    {
+        auto mean_acc        = reduceSumOpInst.GetIdentityValue<AccDataType>();
+        auto square_mean_acc = reduceSumOpInst.GetIdentityValue<AccDataType>();
+        for(int n = 0; n < N; ++n)
+        {
+            AccDataType c_val        = ck::type_convert<AccDataType>(c_m_n(m, n));
+            AccDataType square_c_val = 0;
+            UnarySquareElementOp{}(square_c_val, c_val);
+            reduceSumOpInst(mean_acc, c_val);
+            reduceSumOpInst(square_mean_acc, square_c_val);
+        }
+        averageOpInst(mean_acc, mean_acc);
+        averageOpInst(square_mean_acc, square_mean_acc);
+        mean_m(m)       = ck::type_convert<ReduceDataType>(mean_acc);
+        meanSquare_m(m) = ck::type_convert<ReduceDataType>(square_mean_acc);
+    }
+    // LayerNorm
+    auto layerNormInst = NormalizeFunctor{};
+    for(int m = 0; m < M; ++m)
+    {
+        for(int n = 0; n < N; ++n)
+        {
+            AccDataType out_acc = 0;
+            layerNormInst(out_acc,
+                          ck::type_convert<AccDataType>(c_m_n(m, n)),
+                          ck::type_convert<AccDataType>(mean_m(m)),
+                          ck::type_convert<AccDataType>(meanSquare_m(m)),
+                          ck::type_convert<AccDataType>(gamma_n(n)),
+                          ck::type_convert<AccDataType>(beta_n(n)));
+            out_m_n(m, n) = ck::type_convert<ReduceDataType>(out_acc);
+        }
+    }
+}
+template <typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename BiasDataType,
+          typename D0DataType,
+          typename ReduceDataType,
+          typename GammaDataType,
+          typename BetaDataType,
+          typename NormalizeDataType>
+void DumpGemmLayerNormPerf(float gemm_reduce_time, float normalize_time, int M, int N, int K)
+{
+    std::size_t gemm_flop     = std::size_t(2) * M * N * K + std::size_t(2) * M * N;
+    std::size_t gemm_num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                                sizeof(CDataType) * M * N + sizeof(BiasDataType) * M * N +
+                                sizeof(D0DataType) * M * N + sizeof(ReduceDataType) * M +
+                                sizeof(ReduceDataType) * M;
+    std::size_t normalize_num_byte = sizeof(CDataType) * M * N + sizeof(ReduceDataType) * M +
+                                     sizeof(ReduceDataType) * M + sizeof(GammaDataType) * N +
+                                     sizeof(BetaDataType) * N + sizeof(NormalizeDataType) * M * N;
+    float tflops               = static_cast<float>(gemm_flop) / 1.E9 / gemm_reduce_time;
+    float gemm_gb_per_sec      = gemm_num_byte / 1.E6 / gemm_reduce_time;
+    float normalize_gb_per_sec = normalize_num_byte / 1.E6 / normalize_time;
+    std::cout << "gemm + reduce_mean + reduce_square_mean Perf: " << gemm_reduce_time << " ms, "
+              << tflops << " TFlops, " << gemm_gb_per_sec << " GB/s, " << std::endl;
+    std::cout << "5-ary elementwise Perf: " << normalize_time << " ms, " << normalize_gb_per_sec
+              << " GB/s, " << std::endl;
+}
+int main()
+{
+    // GEMM shape
+    ck::index_t M = 1024;
+    ck::index_t N = 1024;
+    ck::index_t K = 1024;
+    ck::index_t StrideA  = 1024;
+    ck::index_t StrideB  = 1024;
+    ck::index_t StrideC  = 1024;
+    ck::index_t StrideD0 = 1024;
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
+    Tensor<CDataType> c_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
+    Tensor<BiasDataType> bias_n(f_host_tensor_descriptor1d(N, 1));
+    Tensor<D0DataType> c1_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
+    Tensor<ReduceDataType> reduceMean_m(f_host_tensor_descriptor1d(M, 1));
+    Tensor<ReduceDataType> reduceMeanSquare_m(f_host_tensor_descriptor1d(M, 1));
+    Tensor<GammaDataType> gamma_n(f_host_tensor_descriptor1d(N, 1));
+    Tensor<BetaDataType> beta_n(f_host_tensor_descriptor1d(N, 1));
+    Tensor<LayerNormOutDataType> layerNorm_m_n(
+        f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
+    a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-1, 1});
+    b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-1, 1});
+    bias_n.GenerateTensorValue(GeneratorTensor_3<BiasDataType>{-1, 1});
+    c1_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{-5, 5});
+    gamma_n.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{-1, 1});
+    beta_n.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{-1, 1});
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
+    DeviceMem c_device_buf(sizeof(CDataType) * c_m_n.mDesc.GetElementSpace());
+    DeviceMem bias_device_buf(sizeof(BiasDataType) * bias_n.mDesc.GetElementSpace());
+    DeviceMem d0_device_buf(sizeof(D0DataType) * c1_m_n.mDesc.GetElementSpace());
+    DeviceMem reduceMean_device_buf(sizeof(ReduceDataType) * reduceMean_m.mDesc.GetElementSpace());
+    DeviceMem reduceMeanSquare_device_buf(sizeof(ReduceDataType) *
+                                          reduceMeanSquare_m.mDesc.GetElementSpace());
+    DeviceMem gamma_device_buf(sizeof(GammaDataType) * gamma_n.mDesc.GetElementSpace());
+    DeviceMem beta_device_buf(sizeof(BetaDataType) * beta_n.mDesc.GetElementSpace());
+    DeviceMem layerNorm_device_buf(sizeof(LayerNormOutDataType) *
+                                   layerNorm_m_n.mDesc.GetElementSpace());
+    a_device_buf.ToDevice(a_m_k.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+    bias_device_buf.ToDevice(bias_n.mData.data());
+    d0_device_buf.ToDevice(c1_m_n.mData.data());
+    gamma_device_buf.ToDevice(gamma_n.mData.data());
+    beta_device_buf.ToDevice(beta_n.mData.data());
+    auto a_element_op                     = AElementOp{};
+    auto b_element_op                     = BElementOp{};
+    auto c_element_op                     = CElementOp{};
+    auto d_element_op                     = D0ElementOp{};
+    std::array<void*, 3> gemm_element_ops = {&a_element_op, &b_element_op, &c_element_op};
+    auto passthrough                            = UnaryIdenticElementOp{};
+    auto square                                 = UnarySquareElementOp{};
+    auto div                                    = UnaryDivElementOp{N};
+    std::array<void*, 2> reduce_in_element_ops  = {&passthrough, &square};
+    std::array<void*, 2> reduce_out_element_ops = {&div, &div};
+    std::array<void*, 2> p_reduces = {reduceMean_device_buf.GetDeviceBuffer(),
+                                      reduceMeanSquare_device_buf.GetDeviceBuffer()};
+    // Prepare GEMM, reduce_mean, reduce_mean_square
+    auto gemmReduce          = DeviceGemmBiasAddReduceInstance{};
+    auto gemmReduce_invoker  = gemmReduce.MakeInvoker();
+    auto gemmReduce_argument = gemmReduce.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                                                       b_device_buf.GetDeviceBuffer(),
+                                                       bias_device_buf.GetDeviceBuffer(),
+                                                       {d0_device_buf.GetDeviceBuffer()},
+                                                       c_device_buf.GetDeviceBuffer(),
+                                                       p_reduces,
+                                                       M,
+                                                       N,
+                                                       K,
+                                                       StrideA,
+                                                       StrideB,
+                                                       StrideC,
+                                                       {StrideD0},
+                                                       gemm_element_ops,
+                                                       {&d_element_op},
+                                                       reduce_in_element_ops,
+                                                       reduce_out_element_ops);
+    if(!gemmReduce.IsSupportedArgument(gemmReduce_argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+    reduceMean_device_buf.SetZero();
+    reduceMeanSquare_device_buf.SetZero();
+    // Prepare LayerNorm
+    std::array<const void*, 5> input = {c_device_buf.GetDeviceBuffer(),
+                                        reduceMean_device_buf.GetDeviceBuffer(),
+                                        reduceMeanSquare_device_buf.GetDeviceBuffer(),
+                                        gamma_device_buf.GetDeviceBuffer(),
+                                        beta_device_buf.GetDeviceBuffer()};
+    std::array<void*, 1> output      = {layerNorm_device_buf.GetDeviceBuffer()};
+    auto normalize          = DeviceNormalizeInstance{};
+    auto normalize_invoker  = normalize.MakeInvoker();
+    auto normalize_argument = normalize.MakeArgument(input,
+                                                     output,
+                                                     {M, N},
+                                                     {StrideC, 1},
+                                                     {1, 0},
+                                                     {1, 0},
+                                                     {0, 1},
+                                                     {0, 1},
+                                                     {StrideC, 1},
+                                                     NormalizeFunctor{});
+    if(!normalize.IsSupportedArgument(normalize_argument))
+    {
+        throw std::runtime_error("The runtime parameters seems not supported by the "
+                                 "Device5AryElementwise instance, exiting!");
+    }
+    // run kernel
+    gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, false});
+    normalize_invoker.Run(normalize_argument, StreamConfig{nullptr, false});
+    bool pass = true;
+    {
+        // verification
+        Tensor<LayerNormOutDataType> host_layerNorm_m_n(
+            f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
+        host_gemm_layernorm<CDataType, ReduceDataType, ReduceAccDataType>(host_layerNorm_m_n,
+                                                                          a_m_k,
+                                                                          b_k_n,
+                                                                          bias_n,
+                                                                          c1_m_n,
+                                                                          gamma_n,
+                                                                          beta_n,
+                                                                          a_element_op,
+                                                                          b_element_op,
+                                                                          c_element_op,
+                                                                          d_element_op,
+                                                                          M,
+                                                                          N);
+        layerNorm_device_buf.FromDevice(layerNorm_m_n.mData.data());
+        pass &= ck::utils::check_err(layerNorm_m_n.mData,
+                                     host_layerNorm_m_n.mData,
+                                     "Error: Incorrect results layerNorm_m_n",
+                                     1e-2,
+                                     1e-2);
+    }
+    {
+        // evaluate kernel perf
+        bool time_kernel = true;
+        float gemm_reduce_mean_reduce_square_mean_ave_time =
+            gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, time_kernel});
+        float normalize_ave_time =
+            normalize_invoker.Run(normalize_argument, StreamConfig{nullptr, time_kernel});
+        if(time_kernel)
+            DumpGemmLayerNormPerf<ADataType,
+                                  BDataType,
+                                  CDataType,
+                                  BiasDataType,
+                                  D0DataType,
+                                  ReduceDataType,
+                                  GammaDataType,
+                                  BetaDataType,
+                                  LayerNormOutDataType>(
+                gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K);
+    }
+    return pass ? 0 : 1;
+}
--- a/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
 #include <numeric>
 #include <initializer_list>
 #include <cstdlib>
-#include <stdlib.h>
+#include "ck/ck.hpp"
-#include "check_err.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "config.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "device.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp"
-#include "host_tensor.hpp"
+#include "ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp"
-#include "host_tensor_generator.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "device_tensor.hpp"
-#include "device_5ary_elementwise.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
-#include "device_gemm_reduce_xdl_cshuffle.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
-#include "element_wise_operation.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
-#include "reference_gemm.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
-#include "gemm_specialization.hpp"
+#include "ck/library/utility/check_err.hpp"
-#include "element_wise_reduce_operation.hpp"
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -31,8 +33,8 @@ using BDataType                = F16;
 using CDataType                = F16;
 using GemmAccDataType          = F32;
 using ReduceAccDataType        = F32;
-using DDataType                = F32;
+using ReduceDataType           = F32;
-using DPtrsGlobal              = ck::Tuple<DDataType*, DDataType*>;
+using ReducePtrsGlobal         = ck::Tuple<ReduceDataType*, ReduceDataType*>;
 using GammaDataType            = F16;
 using BetaDataType             = F16;
 using LayerNormOutDataType     = F16;
@@ -45,19 +47,16 @@ using CLayout = ck::tensor_layout::gemm::RowMajor;
 using AElementOp  = ck::tensor_operation::element_wise::PassThrough;
 using BElementOp  = ck::tensor_operation::element_wise::PassThrough;
 using CElementOp  = ck::tensor_operation::element_wise::PassThrough;
-using ReduceSumOp = ck::reduce::Add<ReduceAccDataType>;
+using ReduceSumOp = ck::reduce::Add;
-using DxsReduceOp = ck::Tuple<ReduceSumOp, ReduceSumOp>;
+using ReduceOps   = ck::Tuple<ReduceSumOp, ReduceSumOp>;
-using UnaryIdenticElementOp =
+using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough;
-    ck::tensor_operation::element_wise::UnaryIdentic<ReduceAccDataType, ReduceAccDataType, false>;
+using UnaryDivElementOp     = ck::tensor_operation::element_wise::UnaryDivide;
-using UnaryDivElementOp =
+using UnarySquareElementOp  = ck::tensor_operation::element_wise::UnarySquare;
-    ck::tensor_operation::element_wise::UnaryIdentic<ReduceAccDataType, ReduceAccDataType, true>;
+using ReduceInElementOps    = ck::Tuple<UnaryIdenticElementOp, UnarySquareElementOp>;
-using UnarySquareElementOp =
+using ReduceOutElementOps   = ck::Tuple<UnaryDivElementOp, UnaryDivElementOp>;
-    ck::tensor_operation::element_wise::UnarySquare<ReduceAccDataType, ReduceAccDataType, false>;
-using DxsInElementOp  = ck::Tuple<UnaryIdenticElementOp, UnarySquareElementOp>;
+using ReduceGlobalMemOps =
-using DxsOutElementOp = ck::Tuple<UnaryDivElementOp, UnaryDivElementOp>;
-using DxsGlobalMemOp =
    ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicAdd,
                                          ck::InMemoryDataOperationEnum::AtomicAdd>;
@@ -66,11 +65,11 @@ static constexpr auto GemmSpecialization =
 // clang-format off
 using DeviceGemmReduceInstance = ck::tensor_operation::device::DeviceGemmReduce_Xdl_CShuffle
-//######| ALayout| BLayout| CLayout|AData| BData| CData|  GemmAcc| CShuffle| ReduceAcc|         DData|           A|           B|           C|         Dxs|     DxsInEleOp|     DxsAccEleOp|             D|               GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|              CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
+//######| ALayout| BLayout| CLayout|AData| BData| CData|  GemmAcc| CShuffle| ReduceAcc|        ReduceData|           A|           B|           C|    Reduce|     ReduceInEleOp|      ReduceAccEleOp|             Reduce|               GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|              CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
-//######|        |        |        | Type|  Type|  Type| DataType| DataType|  DataType|    Type Tuple| Elementwise| Elementwise| Elementwise|      Reduce|               |                |    MemoryData|     Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN| MXdlPerWave| NXdlPerWave|            _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths|     SrcDstScalarPerVector|        SrcDstScalarPerVector|
+//######|        |        |        | Type|  Type|  Type| DataType| DataType|  DataType|        Type Tuple| Elementwise| Elementwise| Elementwise| Operation|                  |                    |         MemoryData|     Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN| MXdlPerWave| NXdlPerWave|            _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths|     SrcDstScalarPerVector|        SrcDstScalarPerVector|
-//######|        |        |        |     |      |      |         |         |          |              |   Operation|   Operation|   Operation|   Operation|               |                |     Operation|                   |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|            _NBlock_NPerBlock|      _NPerBlock| _MPerBlock_NPerBlock|                _NPerBlock|                   _MPerBlock|
+//######|        |        |        |     |      |      |         |         |          |                  |   Operation|   Operation|   Operation|          |                  |                    |          Operation|                   |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|            _NBlock_NPerBlock|      _NPerBlock| _MPerBlock_NPerBlock|                _NPerBlock|                   _MPerBlock|
-//######|        |        |        |     |      |      |         |         |          |              |            |            |            |            |               |                |              |                   |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                     |                          |                             |
+//######|        |        |        |     |      |      |         |         |          |                  |            |            |            |          |                  |                    |                   |                   |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                     |                          |                             |
-        <     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   DPtrsGlobal,  AElementOp,  BElementOp,  CElementOp, DxsReduceOp, DxsInElementOp, DxsOutElementOp,  DxsGlobalMemOp, GemmSpecialization,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>;
+        <     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,  ReducePtrsGlobal,  AElementOp,  BElementOp,  CElementOp, ReduceOps,ReduceInElementOps, ReduceOutElementOps, ReduceGlobalMemOps, GemmSpecialization,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>;
 // clang-format on
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
@@ -86,8 +85,8 @@ using NormalizeFunctor = ck::tensor_operation::element_wise::Normalize;
 // A:x, B:E[x], C:E[x^2], D:Gamma, E:Beta , F:y
 using DeviceNormalizeInstance =
    ck::tensor_operation::device::Device5AryElementwise<CDataType,
-                                                        DDataType,
+                                                        ReduceDataType,
-                                                        DDataType,
+                                                        ReduceDataType,
                                                        GammaDataType,
                                                        BetaDataType,
                                                        LayerNormOutDataType,
@@ -122,7 +121,7 @@ auto f_host_tensor_descriptor2d =
    };
 template <typename CDataType,
-          typename DDataType,
+          typename ReduceDataType,
          typename A_functor,
          typename B_functor,
          typename C_functor>
@@ -141,9 +140,9 @@ void host_gemm_layernorm(Tensor<LayerNormOutDataType>& out_m_n,
    int StrideC = N;
    Tensor<CDataType> c_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
-    Tensor<DDataType> mean_m(f_host_tensor_descriptor1d(M, 1));
+    Tensor<ReduceDataType> mean_m(f_host_tensor_descriptor1d(M, 1));
-    Tensor<DDataType> meanSquare_m(f_host_tensor_descriptor1d(M, 1));
+    Tensor<ReduceDataType> meanSquare_m(f_host_tensor_descriptor1d(M, 1));
-    auto averageOpInst = UnaryDivElementOp{M};
+    auto averageOpInst = UnaryDivElementOp{N};
    auto ref_gemm    = ReferenceGemmInstance{};
    auto ref_invoker = ref_gemm.MakeInvoker();
@@ -157,13 +156,14 @@ void host_gemm_layernorm(Tensor<LayerNormOutDataType>& out_m_n,
    auto reduceSumOpInst = ReduceSumOp{};
    for(int m = 0; m < M; ++m)
    {
-        float mean_acc        = reduceSumOpInst.GetIdentityValue();
+        auto mean_acc        = reduceSumOpInst.GetIdentityValue<ReduceAccDataType>();
-        float square_mean_acc = reduceSumOpInst.GetIdentityValue();
+        auto square_mean_acc = reduceSumOpInst.GetIdentityValue<ReduceAccDataType>();
        for(int n = 0; n < N; ++n)
        {
-            ReduceAccDataType c_val        = ck::type_convert<float>(c_m_n(m, n));
+            auto c_val        = ck::type_convert<ReduceAccDataType>(c_m_n(m, n));
-            ReduceAccDataType square_c_val = 0;
+            auto square_c_val = reduceSumOpInst.GetIdentityValue<ReduceAccDataType>();
            UnarySquareElementOp{}(square_c_val, c_val);
            reduceSumOpInst(mean_acc, c_val);
@@ -172,8 +172,8 @@ void host_gemm_layernorm(Tensor<LayerNormOutDataType>& out_m_n,
        averageOpInst(mean_acc, mean_acc);
        averageOpInst(square_mean_acc, square_mean_acc);
-        mean_m(m)       = ck::type_convert<DDataType>(mean_acc);
+        mean_m(m)       = ck::type_convert<ReduceDataType>(mean_acc);
-        meanSquare_m(m) = ck::type_convert<DDataType>(square_mean_acc);
+        meanSquare_m(m) = ck::type_convert<ReduceDataType>(square_mean_acc);
    }
    // LayerNorm
@@ -183,7 +183,12 @@ void host_gemm_layernorm(Tensor<LayerNormOutDataType>& out_m_n,
        for(int n = 0; n < N; ++n)
        {
            float out_f32 = 0;
-            layerNormInst(out_f32, c_m_n(m, n), mean_m(m), meanSquare_m(m), gamma_n(n), beta_n(n));
+            layerNormInst(out_f32,
+                          static_cast<float>(c_m_n(m, n)),
+                          static_cast<float>(mean_m(m)),
+                          static_cast<float>(meanSquare_m(m)),
+                          static_cast<float>(gamma_n(n)),
+                          static_cast<float>(beta_n(n)));
            out_m_n(m, n) = static_cast<out_type>(out_f32);
        }
    }
@@ -192,7 +197,7 @@ void host_gemm_layernorm(Tensor<LayerNormOutDataType>& out_m_n,
 template <typename ADataType,
          typename BDataType,
          typename CDataType,
-          typename DDataType,
+          typename ReduceDataType,
          typename GammaDataType,
          typename BetaDataType,
          typename NormalizeDataType>
@@ -200,11 +205,11 @@ void DumpGemmLayerNormPerf(float gemm_reduce_time, float normalize_time, int M,
 {
    std::size_t gemm_flop     = std::size_t(2) * M * N * K;
    std::size_t gemm_num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
-                                sizeof(CDataType) * M * N + sizeof(DDataType) * M +
+                                sizeof(CDataType) * M * N + sizeof(ReduceDataType) * M +
-                                sizeof(DDataType) * M;
+                                sizeof(ReduceDataType) * M;
-    std::size_t normalize_num_btye = sizeof(CDataType) * M * N + sizeof(DDataType) * M +
+    std::size_t normalize_num_btye = sizeof(CDataType) * M * N + sizeof(ReduceDataType) * M +
-                                     sizeof(DDataType) * M + sizeof(GammaDataType) * N +
+                                     sizeof(ReduceDataType) * M + sizeof(GammaDataType) * N +
                                     sizeof(BetaDataType) * N + sizeof(NormalizeDataType) * M * N;
    float tflops               = static_cast<float>(gemm_flop) / 1.E9 / gemm_reduce_time;
@@ -232,8 +237,8 @@ int main()
    Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
    Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
    Tensor<CDataType> c_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
-    Tensor<DDataType> reduceMean_m(f_host_tensor_descriptor1d(M, 1));
+    Tensor<ReduceDataType> reduceMean_m(f_host_tensor_descriptor1d(M, 1));
-    Tensor<DDataType> reduceMeanSquare_m(f_host_tensor_descriptor1d(M, 1));
+    Tensor<ReduceDataType> reduceMeanSquare_m(f_host_tensor_descriptor1d(M, 1));
    Tensor<GammaDataType> gamma_n(f_host_tensor_descriptor1d(N, 1));
    Tensor<BetaDataType> beta_n(f_host_tensor_descriptor1d(N, 1));
    Tensor<LayerNormOutDataType> layerNorm_m_n(
@@ -247,8 +252,8 @@ int main()
    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
    DeviceMem c_device_buf(sizeof(CDataType) * c_m_n.mDesc.GetElementSpace());
-    DeviceMem reduceMean_device_buf(sizeof(DDataType) * reduceMean_m.mDesc.GetElementSpace());
+    DeviceMem reduceMean_device_buf(sizeof(ReduceDataType) * reduceMean_m.mDesc.GetElementSpace());
-    DeviceMem reduceMeanSquare_device_buf(sizeof(DDataType) *
+    DeviceMem reduceMeanSquare_device_buf(sizeof(ReduceDataType) *
                                          reduceMeanSquare_m.mDesc.GetElementSpace());
    DeviceMem gamma_device_buf(sizeof(GammaDataType) * gamma_n.mDesc.GetElementSpace());
    DeviceMem beta_device_buf(sizeof(BetaDataType) * beta_n.mDesc.GetElementSpace());
@@ -260,35 +265,40 @@ int main()
    gamma_device_buf.ToDevice(gamma_n.mData.data());
    beta_device_buf.ToDevice(beta_n.mData.data());
-    auto a_element_op = AElementOp{};
+    auto a_element_op                     = AElementOp{};
-    auto b_element_op = BElementOp{};
+    auto b_element_op                     = BElementOp{};
-    auto c_element_op = CElementOp{};
+    auto c_element_op                     = CElementOp{};
-    auto dxs_global =
+    std::array<void*, 3> gemm_element_ops = {&a_element_op, &b_element_op, &c_element_op};
-        ck::make_tuple(static_cast<DDataType*>(reduceMean_device_buf.GetDeviceBuffer()),
-                       static_cast<DDataType*>(reduceMeanSquare_device_buf.GetDeviceBuffer()));
-    auto dxs_in_element_op  = DxsInElementOp{};
+    auto passthrough                            = UnaryIdenticElementOp{};
-    auto dxs_out_element_op = DxsOutElementOp{M, M};
+    auto square                                 = UnarySquareElementOp{};
+    auto div                                    = UnaryDivElementOp{N};
+    std::array<void*, 2> reduce_in_element_ops  = {&passthrough, &square};
+    std::array<void*, 2> reduce_out_element_ops = {&div, &div};
+    std::array<void*, 2> p_reduces = {reduceMean_device_buf.GetDeviceBuffer(),
+                                      reduceMeanSquare_device_buf.GetDeviceBuffer()};
    // Prepare GEMM, reduce_mean, reduce_mean_square
-    auto gemmReduce         = DeviceGemmReduceInstance{};
+    auto gemmReduce          = DeviceGemmReduceInstance{};
-    auto gemmReduce_invoker = gemmReduce.MakeInvoker();
+    auto gemmReduce_invoker  = gemmReduce.MakeInvoker();
-    auto gemmReduce_argument =
+    auto gemmReduce_argument = gemmReduce.MakeArgument(a_device_buf.GetDeviceBuffer(),
-        gemmReduce.MakeArgument(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                                       b_device_buf.GetDeviceBuffer(),
-                                static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                                       nullptr,
-                                static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                                                       {},
-                                dxs_global,
+                                                       c_device_buf.GetDeviceBuffer(),
-                                M,
+                                                       p_reduces,
-                                N,
+                                                       M,
-                                K,
+                                                       N,
-                                StrideA,
+                                                       K,
-                                StrideB,
+                                                       StrideA,
-                                StrideC,
+                                                       StrideB,
-                                a_element_op,
+                                                       StrideC,
-                                b_element_op,
+                                                       {},
-                                c_element_op,
+                                                       gemm_element_ops,
-                                dxs_in_element_op,
+                                                       {},
-                                dxs_out_element_op);
+                                                       reduce_in_element_ops,
+                                                       reduce_out_element_ops);
    if(!gemmReduce.IsSupportedArgument(gemmReduce_argument))
    {
@@ -301,23 +311,25 @@ int main()
    reduceMeanSquare_device_buf.SetZero();
    // Prepare LayerNorm
+    std::array<const void*, 5> input = {c_device_buf.GetDeviceBuffer(),
+                                        reduceMean_device_buf.GetDeviceBuffer(),
+                                        reduceMeanSquare_device_buf.GetDeviceBuffer(),
+                                        gamma_device_buf.GetDeviceBuffer(),
+                                        beta_device_buf.GetDeviceBuffer()};
+    std::array<void*, 1> output      = {layerNorm_device_buf.GetDeviceBuffer()};
    auto normalize          = DeviceNormalizeInstance{};
    auto normalize_invoker  = normalize.MakeInvoker();
-    auto normalize_argument = normalize.MakeArgument(
+    auto normalize_argument = normalize.MakeArgument(input,
-        static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                                                     output,
-        static_cast<DDataType*>(reduceMean_device_buf.GetDeviceBuffer()),
+                                                     {M, N},
-        static_cast<DDataType*>(reduceMeanSquare_device_buf.GetDeviceBuffer()),
+                                                     {StrideC, 1},
-        static_cast<GammaDataType*>(gamma_device_buf.GetDeviceBuffer()),
+                                                     {1, 0},
-        static_cast<BetaDataType*>(beta_device_buf.GetDeviceBuffer()),
+                                                     {1, 0},
-        static_cast<LayerNormOutDataType*>(layerNorm_device_buf.GetDeviceBuffer()),
+                                                     {0, 1},
-        {M, N},
+                                                     {0, 1},
-        {StrideC, 1},
+                                                     {StrideC, 1},
-        {1, 0},
+                                                     NormalizeFunctor{});
-        {1, 0},
-        {0, 1},
-        {0, 1},
-        {StrideC, 1},
-        NormalizeFunctor{});
    if(!normalize.IsSupportedArgument(normalize_argument))
    {
@@ -335,16 +347,16 @@ int main()
        Tensor<LayerNormOutDataType> host_layerNorm_m_n(
            f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
-        host_gemm_layernorm<CDataType, DDataType>(host_layerNorm_m_n,
+        host_gemm_layernorm<CDataType, ReduceDataType>(host_layerNorm_m_n,
-                                                  a_m_k,
+                                                       a_m_k,
-                                                  b_k_n,
+                                                       b_k_n,
-                                                  gamma_n,
+                                                       gamma_n,
-                                                  beta_n,
+                                                       beta_n,
-                                                  a_element_op,
+                                                       a_element_op,
-                                                  b_element_op,
+                                                       b_element_op,
-                                                  c_element_op,
+                                                       c_element_op,
-                                                  M,
+                                                       M,
-                                                  N);
+                                                       N);
        layerNorm_device_buf.FromDevice(layerNorm_m_n.mData.data());
        pass &= ck::utils::check_err(layerNorm_m_n.mData,
@@ -367,7 +379,7 @@ int main()
            DumpGemmLayerNormPerf<ADataType,
                                  BDataType,
                                  CDataType,
-                                  DDataType,
+                                  ReduceDataType,
                                  GammaDataType,
                                  BetaDataType,
                                  LayerNormOutDataType>(

--- a/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp
+++ b/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
 #include <numeric>
 #include <initializer_list>
-#include <cstdlib>
-#include <stdlib.h>
-#include <half.hpp>
-#include "check_err.hpp"
-#include "config.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "host_gemm.hpp"
-#include "device_tensor.hpp"
-#include "device_base.hpp"
-#include "device_gemm_xdl_c_shuffle_bias_2d.hpp"
-#include "element_wise_operation.hpp"
-#include "reference_gemm_bias_2d.hpp"
+#include "ck/ck.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_xdl_layernorm_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/utility/reduction_operator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+// This example demonstrate a single kernel that runs GEMM layer and laynorm in one fused kernel
+//
+// The GEMM + Layernorm implementation is a specialized kernel which allows fusing both layers
+// together given the condition GEMM extents N of MNK is spanned by a single workgroup. For example,
+// a kernel configured with NPerBlock = 128 allows to operate on all GEMM sizes if N <= 128
+//
+// D = Layernorm(acc_element_op(A * B + broadcast(bias)) + add) * broadcast(gamma) + broadcast(beta)
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
-using ADataType   = ck::half_t;
+using F16 = ck::half_t;
-using BDataType   = ck::half_t;
+using F32 = float;
-using CDataType   = ck::half_t;
-using AccDataType = float;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+using ADataType        = F16;
+using BDataType        = F16;
+using CDataType        = F16;
+using C0DataType       = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F16;
 using ALayout = ck::tensor_layout::gemm::RowMajor;
 using BLayout = ck::tensor_layout::gemm::ColumnMajor;
 using CLayout = ck::tensor_layout::gemm::RowMajor;
+struct Relu
+{
+    template <typename OutT, typename InT>
+    __host__ __device__ void operator()(OutT& y, const InT& x) const
+    {
+        y = x > 0 ? x : 0;
+    }
+};
 using AElementOp = ck::tensor_operation::element_wise::PassThrough;
 using BElementOp = ck::tensor_operation::element_wise::PassThrough;
-using CElementOp = ck::tensor_operation::element_wise::AlphaBetaAdd;
+// Elementwise operation that operates on the output of matrix multiplication
+// i.e., AccElementOp(A * B + bias)
+using AccElementOp = Relu;
+// Elementwise operation that operates on the output of layer normalization
+using CElementOp = Relu;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
 // clang-format off
-using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle_Bias_2d<
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmLayerNorm_Xdl_CShuffle
-    ADataType,              // ADataType
+//######| ALayout| BLayout| CLayout|     AData|     BData|     CData|     C0Data|     GemmAcc|         CShuffle|   ReduceAcc|           A|           B|          Acc|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|              CReduce|     CReduceThreadCopy|
-    BDataType,              // BDataType
+//######|        |        |        |      Type|      Type|      Type|       Type|    DataType|         DataType|    DataType| Elementwise| Elementwise|  Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN| MXdlPerWave| NXdlPerWave|            _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector|
-    CDataType,              // CDataType
+//######|        |        |        |          |          |          |           |            |                 |            |   Operation|   Operation|    Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|            _NBlock_NPerBlock|      _NPerBlock| _MPerBlock_NPerBlock|            _NPerBlock|
-    AccDataType,            // AccDataType
+//######|        |        |        |          |          |          |           |            |                 |            |            |            |             |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                     |                      |
-    ALayout,                // ALayout
+        <     Row,     Col,     Row, ADataType, BDataType, CDataType, C0DataType, AccDataType, CShuffleDataType, AccDataType,  AElementOp,  BElementOp, AccElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           2,               S<1, 32, 1, 8>,               8,             S<64, 4>,                     4>;
-    BLayout,                // BLayout
-    CLayout,                // CLayout
-    AElementOp,             // AElementwiseOperation
-    BElementOp,             // BElementwiseOperation
-    CElementOp,             // CElementwiseOperation
-    256,                    // BlockSize
-    256,                    // MPerBlock
-    128,                    // NPerBlock
-    4,                      // K0PerBlock
-    8,                      // K1
-    32,                     // MPerXDL
-    32,                     // NPerXDL
-    4,                      // MXdlPerWave
-    2,                      // NXdlPerWave
-    S<4, 64, 1>,            // ABlockTransferThreadClusterLengths_K0_M_K1
-    S<1, 0, 2>,             // ABlockTransferThreadClusterArrangeOrder
-    S<1, 0, 2>,             // ABlockTransferSrcAccessOrder
-    2,                      // ABlockTransferSrcVectorDim
-    8,                      // ABlockTransferSrcScalarPerVector
-    8,                      // ABlockTransferDstScalarPerVector_K1
-    true,                   // ABlockLdsAddExtraM
-    S<4, 64, 1>,            // BBlockTransferThreadClusterLengths_K0_N_K1
-    S<1, 0, 2>,             // BBlockTransferThreadClusterArrangeOrder
-    S<1, 0, 2>,             // BBlockTransferSrcAccessOrder
-    2,                      // BBlockTransferSrcVectorDim
-    8,                      // BBlockTransferSrcScalarPerVector
-    8,                      // BBlockTransferDstScalarPerVector_K1
-    true,                   // BBlockLdsAddExtraN
-    1,                      // CShuffleMXdlPerWavePerShuffle
-    1,                      // CShuffleNXdlPerWavePerShuffle
-    S<1, 1, 32, 1, 1, 8>,   // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
-    8>;                     // CBlockTransferScalarPerVector_NWaveNPerXdl
 // clang-format on
-using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmBias2D<ADataType,
+using ReferenceInstance = ck::tensor_operation::host::ReferenceGemmLayernorm<ADataType,
-                                                                              BDataType,
+                                                                             BDataType,
-                                                                              CDataType,
+                                                                             CDataType,
-                                                                              CDataType,
+                                                                             C0DataType,
-                                                                              AccDataType,
+                                                                             AccDataType,
-                                                                              AElementOp,
+                                                                             AElementOp,
-                                                                              BElementOp,
+                                                                             BElementOp,
-                                                                              CElementOp>;
+                                                                             AccElementOp,
+                                                                             CElementOp>;
 int main(int argc, char* argv[])
 {
@@ -92,32 +90,24 @@ int main(int argc, char* argv[])
    // GEMM shape
    ck::index_t M = 3840;
-    ck::index_t N = 4096;
+    ck::index_t N = 128;
    ck::index_t K = 4096;
    ck::index_t StrideA = 4096;
    ck::index_t StrideB = 4096;
-    ck::index_t StrideC = 4096;
+    ck::index_t StrideC = 128;
-    float alpha = 1.0f;
-    float beta  = 1.0f;
-    if(argc == 4)
+    if(argc == 1)
    {
-        do_verification = std::stoi(argv[1]);
+        // do nothing
-        init_method     = std::stoi(argv[2]);
-        time_kernel     = std::stoi(argv[3]);
    }
-    else if(argc == 6)
+    else if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
-        alpha = std::stof(argv[4]);
-        beta  = std::stof(argv[5]);
    }
-    else if(argc == 12)
+    else if(argc == 10)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
@@ -130,16 +120,13 @@ int main(int argc, char* argv[])
        StrideA = std::stoi(argv[7]);
        StrideB = std::stoi(argv[8]);
        StrideC = std::stoi(argv[9]);
-        alpha = std::stof(argv[10]);
-        beta  = std::stof(argv[11]);
    }
    else
    {
        printf("arg1: verification (0=no, 1=yes)\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
        printf("arg3: time kernel (0=n0, 1=yes)\n");
-        printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, alpha, beta\n");
+        printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
        exit(0);
    }
@@ -159,14 +146,21 @@ int main(int argc, char* argv[])
    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
-    Tensor<CDataType> c0_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<AccDataType> acc_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<C0DataType> c0_n_bias(HostTensorDescriptor(std::vector<size_t>({size_t(N)})));
+    Tensor<C0DataType> c0_m_n_add(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<C0DataType> c0_n_gamma(HostTensorDescriptor(std::vector<size_t>({size_t(N)})));
+    Tensor<C0DataType> c0_n_beta(HostTensorDescriptor(std::vector<size_t>({size_t(N)})));
    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
-    std::cout << "c0_m_n: " << c0_m_n.mDesc << std::endl;
    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
+    std::cout << "c0_n_bias: " << c0_n_bias.mDesc << std::endl;
+    std::cout << "c0_m_n_add: " << c0_m_n_add.mDesc << std::endl;
+    std::cout << "c0_n_gamma: " << c0_n_gamma.mDesc << std::endl;
+    std::cout << "c0_n_beta: " << c0_n_beta.mDesc << std::endl;
    switch(init_method)
    {
@@ -174,40 +168,63 @@ int main(int argc, char* argv[])
    case 1:
        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-        c0_m_n.GenerateTensorValue(GeneratorTensor_2<CDataType>{-5, 5});
        break;
-    default:
+    case 2:
        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
-        c0_m_n.GenerateTensorValue(GeneratorTensor_3<CDataType>{-0.5, 0.5});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<0>{});
+        b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
    }
-    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
+    c0_n_bias.GenerateTensorValue(GeneratorTensor_2<C0DataType>{-5, 5});
-    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
+    c0_m_n_add.GenerateTensorValue(GeneratorTensor_2<C0DataType>{-5, 5});
-    DeviceMem c0_m_n_device_buf(sizeof(CDataType) * c0_m_n.mDesc.GetElementSpace());
+    c0_n_gamma.GenerateTensorValue(GeneratorTensor_2<C0DataType>{0, 2});
-    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+    c0_n_beta.GenerateTensorValue(GeneratorTensor_2<C0DataType>{0, 5});
+    c_m_n_host_result.GenerateTensorValue(GeneratorTensor_1<CDataType>{0});
-    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
+    acc_m_n_host_result.GenerateTensorValue(GeneratorTensor_1<AccDataType>{0});
-    b_k_n_device_buf.ToDevice(b_k_n.mData.data());
-    c0_m_n_device_buf.ToDevice(c0_m_n.mData.data());
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
-    c_m_n_device_buf.ToDevice(c_m_n_device_result.mData.data());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
+    DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem c0_bias_buf(sizeof(C0DataType) * c0_n_bias.mDesc.GetElementSpace());
+    DeviceMem c0_add_buf(sizeof(C0DataType) * c0_m_n_add.mDesc.GetElementSpace());
+    DeviceMem c0_gamma_buf(sizeof(C0DataType) * c0_n_gamma.mDesc.GetElementSpace());
+    DeviceMem c0_beta_buf(sizeof(C0DataType) * c0_n_beta.mDesc.GetElementSpace());
+    a_device_buf.ToDevice(a_m_k.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+    c0_bias_buf.ToDevice(c0_n_bias.mData.data());
+    c0_add_buf.ToDevice(c0_m_n_add.mData.data());
+    c0_gamma_buf.ToDevice(c0_n_gamma.mData.data());
+    c0_beta_buf.ToDevice(c0_n_beta.mData.data());
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto acc_element_op = AccElementOp{};
+    auto c_element_op   = CElementOp{};
    // do GEMM
    auto gemm     = DeviceGemmInstance{};
    auto invoker  = gemm.MakeInvoker();
-    auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
+    auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
-                                      static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
+                                      static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
-                                      static_cast<CDataType*>(c0_m_n_device_buf.GetDeviceBuffer()),
+                                      static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
-                                      static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
+                                      static_cast<C0DataType*>(c0_add_buf.GetDeviceBuffer()),
+                                      static_cast<C0DataType*>(c0_bias_buf.GetDeviceBuffer()),
+                                      static_cast<C0DataType*>(c0_gamma_buf.GetDeviceBuffer()),
+                                      static_cast<C0DataType*>(c0_beta_buf.GetDeviceBuffer()),
                                      M,
                                      N,
                                      K,
                                      StrideA,
                                      StrideB,
                                      StrideC,
-                                      AElementOp{},
+                                      a_element_op,
-                                      BElementOp{},
+                                      b_element_op,
-                                      CElementOp{alpha, beta});
+                                      acc_element_op,
+                                      c_element_op);
    if(!gemm.IsSupportedArgument(argument))
    {
@@ -218,36 +235,55 @@ int main(int argc, char* argv[])
    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
-    std::size_t flop = std::size_t(2) * M * N * K;
+    // extra 6MN flops due to: bias + add + gamma + beta + norm_sub + norm_div,
-    std::size_t num_btype =
+    // excluding reduction steps
-        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
+    std::size_t flop = std::size_t(2) * M * N * K + std::size_t(6) * M * N;
+    // extra MN and 3N due to c0_add (MxN), bias (1xN), gamma (1xN), beta (1xN)
+    std::size_t bytes = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                        sizeof(CDataType) * 2 * M * N + sizeof(C0DataType) * 3 * N;
    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-    float gb_per_sec = num_btype / 1.E6 / ave_time;
+    float gb_per_sec = bytes / 1.E6 / ave_time;
-    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
-              << std::endl;
-    c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << gemm.GetTypeString() << std::endl;
+    bool pass = true;
    if(do_verification)
    {
-        auto ref_gemm    = ReferenceGemmInstance{};
+        c_device_buf.FromDevice(c_m_n_device_result.mData.data());
+        auto ref_gemm    = ReferenceInstance{};
        auto ref_invoker = ref_gemm.MakeInvoker();
        auto ref_argument = ref_gemm.MakeArgument(a_m_k,
                                                  b_k_n,
-                                                  c0_m_n,
                                                  c_m_n_host_result,
-                                                  AElementOp{},
+                                                  c0_n_bias,
-                                                  BElementOp{},
+                                                  c0_m_n_add,
-                                                  CElementOp{alpha, beta});
+                                                  c0_n_gamma,
+                                                  c0_n_beta,
+                                                  a_element_op,
+                                                  b_element_op,
+                                                  acc_element_op,
+                                                  c_element_op);
        ref_invoker.Run(ref_argument);
-        return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1;
+        if constexpr(std::is_same<CShuffleDataType, F32>::value)
+        {
+            pass &= ck::utils::check_err(
+                c_m_n_device_result.mData, c_m_n_host_result.mData, "Error: Incorrect results c");
+        }
+        else if constexpr(std::is_same<CShuffleDataType, F16>::value)
+        {
+            pass &= ck::utils::check_err(c_m_n_device_result.mData,
+                                         c_m_n_host_result.mData,
+                                         "Error: Incorrect results c",
+                                         1e-2,
+                                         1e-2);
+        }
    }
+    return pass ? 0 : 1;
-    return 0;
 }
--- a/example/22_cgemm/cgemm_xdl_fp16.cpp
+++ b/example/22_cgemm/cgemm_xdl_fp16.cpp
-/*******************************************************************************
+// SPDX-License-Identifier: MIT
- *
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
- * MIT License
- *
- * Copyright (c) 2022 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
 #include <iostream>
 #include <numeric>
 #include <initializer_list>
 #include <cstdlib>
-#include <stdlib.h>
-#include <half.hpp>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "check_err.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "config.hpp"
+#include "ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
+#include "ck/library/utility/check_err.hpp"
-#include "host_tensor_generator.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
-#include "device_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
-#include "device_cgemm_4gemm_xdl_cshuffle.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
-#include "element_wise_operation.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp"
-#include "reference_cgemm.hpp"
-#include "gemm_specialization.hpp"
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/23_softmax/CMakeLists.txt
+++ b/example/23_softmax/CMakeLists.txt
+add_example_executable(example_softmax_blockwise softmax_blockwise.cpp)
\ No newline at end of file
--- a/example/23_softmax/README.md
+++ b/example/23_softmax/README.md
+# Instructions for ```example_softmax_blockwise```
+## Run ```example_softmax_blockwise```
+```bash
+# -D <xxx> : input 3-d tensor lengths
+# -v <x> :   verification (0=no, 1=yes)
+#arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
+#arg2: time kernel (0=no, 1=yes)
+example_softmax_blockwise -D 4,128,2048 -v 1 1 1
+```
+Result
+```
+launch_and_time_kernel: grid_dim {64, 1, 1}, block_dim {256, 1, 1}
+Warm up 1 time
+Start running 10 times...
+Perf: 0.0242877 ms, 259.039 GB/s, DeviceReduceSoftmax<256,M_C8_S1,K_C32_S8,InSrcVectorDim_1_InSrcVectorSize_8_OutDstVectorSize_8>
+```
--- a/example/23_softmax/softmax_blockwise.cpp
+++ b/example/23_softmax/softmax_blockwise.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <getopt.h>
+#include "ck/ck.hpp"
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_common_util.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
+using namespace ck;
+using namespace ck::tensor_operation::device;
+using InDataType  = ck::half_t;
+using OutDataType = ck::half_t;
+using AccDataType = float;
+constexpr int Rank         = 3;
+constexpr int NumReduceDim = 1;
+using DeviceInstance = DeviceSoftmax<InDataType,
+                                     AccDataType,
+                                     OutDataType,
+                                     Rank,
+                                     NumReduceDim,
+                                     256, // BlockSize
+                                     8,   // ClusterM
+                                     32,  // ClusterK
+                                     1,   // SliceM
+                                     8,   // SliceK
+                                     1,   // SrcVecDim (0=M, 1=K)
+                                     8,   // SrcScalarPerVector
+                                     8>;  // OutScalarPerVector
+static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
+                                       {"verify", required_argument, nullptr, 'v'},
+                                       {"help", no_argument, nullptr, '?'},
+                                       {nullptr, 0, nullptr, 0}};
+class SimpleAppArgs
+{
+    private:
+    int option_index = 0;
+    public:
+    std::vector<size_t> inLengths   = {8, 128, 2048};
+    std::vector<AccDataType> scales = {2.0f, 2.0f};
+    bool do_verification = true;
+    int init_method      = 2;
+    bool time_kernel     = true;
+    public:
+    void show_usage(const char* cmd)
+    {
+        std::cout << "Usage of " << cmd << std::endl;
+        std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths"
+                  << std::endl;
+        std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by "
+                     "comparing with the host-based reduction"
+                  << std::endl;
+        std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer "
+                     "value, 3=decimal value)"
+                  << std::endl;
+        std::cout << "Arg2 -- time kernel (0=no, 1=yes)" << std::endl;
+    };
+    int processArgs(int argc, char* argv[])
+    {
+        using ck::host_common::getTypeValuesFromString;
+        int ch;
+        while(1)
+        {
+            ch = getopt_long(argc, argv, "D:v:l:", long_options, &option_index);
+            if(ch == -1)
+                break;
+            switch(ch)
+            {
+            case 'D':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+                inLengths = getTypeValuesFromString<size_t>(optarg);
+                break;
+            case 'v':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+                do_verification = static_cast<bool>(std::atoi(optarg));
+                break;
+            case '?':
+                if(std::string(long_options[option_index].name) == "help")
+                {
+                    show_usage(argv[0]);
+                    return (-1);
+                };
+                break;
+            default: show_usage(argv[0]); return (-1);
+            };
+        };
+        if(optind + 2 > argc)
+            throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
+        init_method = std::atoi(argv[optind++]);
+        time_kernel = static_cast<bool>(std::atoi(argv[optind]));
+        if(scales.empty())
+        {
+            scales.push_back(1.0f);
+            scales.push_back(0.0f);
+        };
+        return (0);
+    };
+};
+int main(int argc, char* argv[])
+{
+    // Example: batched gemm C[G, M, N] applies max/sum reduction along N internally
+    const std::vector<int> invariantDims{0, 1};
+    const std::vector<int> reduceDims{2};
+    SimpleAppArgs args;
+    if(argc > 1)
+    {
+        if(args.processArgs(argc, argv) < 0)
+            return (-1);
+    };
+    Tensor<InDataType> in(args.inLengths);
+    Tensor<OutDataType> out_ref(args.inLengths);
+    Tensor<OutDataType> out(args.inLengths);
+    auto inStrides  = in.mDesc.GetStrides();
+    auto outStrides = out.mDesc.GetStrides();
+    AccDataType alpha = args.scales[0];
+    AccDataType beta  = args.scales[1];
+    std::cout << "in: " << in.mDesc << std::endl;
+    std::cout << "out: " << out.mDesc << std::endl;
+    std::size_t num_thread = 1;
+    if(args.do_verification)
+    {
+        switch(args.init_method)
+        {
+        case 0: break;
+        case 1:
+            in.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread);
+            if(beta != 0.0f)
+                out_ref.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1}, num_thread);
+            break;
+        case 2:
+            in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
+            if(beta != 0.0f)
+                out_ref.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5}, num_thread);
+            break;
+        default:
+            in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
+            if(beta != 0.0f)
+                out_ref.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-5.0, 5.0}, num_thread);
+        }
+        if(beta != 0.0f)
+            for(size_t i = 0; i < out_ref.mDesc.GetElementSpace(); i++)
+                out.mData[i] = out_ref.mData[i];
+    };
+    // std::cout << "beta = " << beta << std::endl;
+    // LogRangeAsType<float>(std::cout << "tensor in: " , in.mData, ",") << std::endl;
+    // LogRangeAsType<float>(std::cout << "tensor prior out: " , out.mData, ",") << std::endl;
+    // these buffers are usually provided by the user application
+    DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace());
+    DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpace());
+    in_dev.ToDevice(in.mData.data());
+    if(beta != 0.0f)
+        out_dev.ToDevice(out.mData.data());
+    if(args.do_verification)
+    {
+        using ReferenceInstance =
+            tensor_operation::host::ReferenceSoftmax<InDataType, OutDataType, AccDataType>;
+        ReferenceInstance ref;
+        auto ref_arg = ref.MakeArgument(in, out_ref, alpha, beta, reduceDims);
+        auto invoker = ref.MakeInvoker();
+        invoker.Run(ref_arg);
+        // LogRangeAsType<float>(std::cout << "tensor out_ref: ", out_ref.mData, ",") << std::endl;
+    };
+    std::vector<ck::index_t> i_inLengths;
+    std::vector<ck::index_t> i_inStrides;
+    i_inLengths.assign(args.inLengths.begin(), args.inLengths.end());
+    i_inStrides.assign(inStrides.begin(), inStrides.end());
+    auto device_instance = DeviceInstance{};
+    auto argument_ptr = device_instance.MakeArgumentPointer(i_inLengths,
+                                                            i_inStrides,
+                                                            reduceDims,
+                                                            &alpha,
+                                                            &beta,
+                                                            in_dev.GetDeviceBuffer(),
+                                                            out_dev.GetDeviceBuffer());
+    if(!device_instance.IsSupportedArgument(argument_ptr.get()))
+    {
+        std::cout
+            << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!"
+            << std::endl;
+        return 1;
+    };
+    std::string instance_name = device_instance.GetTypeString();
+    auto invoker_ptr = device_instance.MakeInvokerPointer();
+    bool pass = true;
+    if(args.do_verification)
+    {
+        invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        out_dev.FromDevice(out.mData.data());
+        // LogRangeAsType<float>(std::cout << "tensor out: " , out.mData, ",") << std::endl;
+        pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
+    };
+    float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, args.time_kernel});
+    std::size_t num_bytes =
+        in.mDesc.GetElementSize() * sizeof(InDataType) +
+        (beta == 0.0f ? 1 : 2) * out.mDesc.GetElementSize() * sizeof(OutDataType);
+    float gb_per_sec = num_bytes / 1.E6 / avg_time;
+    std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << instance_name
+              << std::endl;
+    return (pass ? 0 : 1);
+}
--- a/example/24_batched_gemm_c_permute/CMakeLists.txt
+++ b/example/24_batched_gemm_c_permute/CMakeLists.txt
+add_example_executable(example_batched_gemm_c_permute_xdl_fp16 batched_gemm_c_permute_xdl_fp16.cpp)
--- a/example/24_batched_gemm_c_permute/batched_gemm_c_permute_xdl_fp16.cpp
+++ b/example/24_batched_gemm_c_permute/batched_gemm_c_permute_xdl_fp16.cpp
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_c_permute_xdl.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using F16 = ck::half_t;
+using F32 = float;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using ADataType   = ck::half_t;
+using BDataType   = ck::half_t;
+using CDataType   = ck::half_t;
+using AccDataType = float;
+using ALayout = ck::tensor_layout::gemm::RowMajor;
+using BLayout = ck::tensor_layout::gemm::ColumnMajor;
+using CLayout = ck::tensor_layout::gemm::RowMajor;
+using AElementOp = ck::tensor_operation::element_wise::PassThrough;
+using BElementOp = ck::tensor_operation::element_wise::PassThrough;
+using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+// static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+// static constexpr auto MNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+static constexpr auto MNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmCPermuteXdl
+//######| ALayout| BLayout| AData| BData| CData| AccData|           A|           B|           C|          GEMM|      Num| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|     CShuffle|    CShuffle| CBlockTransferClusterLengths|   CBlockTransfer|
+//######|        |        |  Type|  Type|  Type|    Type| Elementwise| Elementwise| Elementwise|Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|  MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl|  ScalarPerVector|
+//######|        |        |      |      |      |        |   Operation|   Operation|   Operation|              |         |      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |   PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|    _NWaveNPerXdl|
+//######|        |        |      |      |      |        |            |            |            |              |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |             |            |                             |                 |
+//      <     Row,     Col,   F16,   F16,   F16,     F32, PassThrough, PassThrough, PassThrough,     MNPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,            1,           1,               S<1, 32, 1, 8>,                8>;
+        <     Row,     Col,   F16,   F16,   F16,     F32, PassThrough, PassThrough, PassThrough,     MNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,            1,           1,               S<1, 32, 1, 8>,                8>;
+// clang-format on
+using ReferenceBatchedGemmInstance = ck::tensor_operation::host::
+    ReferenceBatchedGemm<ADataType, BDataType, CDataType, AElementOp, BElementOp, CElementOp>;
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+    const int M = 88;
+    const int N = 64;
+    const int K = 88;
+    const int stride_A = K;
+    const int stride_B = K;
+    const int G0 = 1024;
+    const int G1 = 10;
+    const int batch_count = G0 * G1;
+    // output layout - [G0, M, G1, N]
+    const int stride_G0 = M * G1 * N;
+    const int stride_G1 = N;
+    const int stride_M  = G1 * N;
+    const int stride_N  = 1;
+    if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
+        exit(0);
+    }
+    // GEMM shape
+    ck::tensor_operation::device::BatchedGemmCPermuteDesc batched_gemm_c_permute_desc{
+        G0, G1, M, N, stride_G0, stride_G1, stride_M, stride_N};
+    auto f_host_tensor_descriptor = [](std::size_t batch_count_,
+                                       std::size_t row,
+                                       std::size_t col,
+                                       std::size_t stride,
+                                       auto layout) {
+        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+        {
+            return HostTensorDescriptor(std::vector<std::size_t>({batch_count_, row, col}),
+                                        std::vector<std::size_t>({row * stride, stride, 1}));
+        }
+        else
+        {
+            return HostTensorDescriptor(std::vector<std::size_t>({batch_count_, row, col}),
+                                        std::vector<std::size_t>({col * stride, 1, stride}));
+        }
+    };
+    Tensor<ADataType> a_g_m_k(f_host_tensor_descriptor(batch_count, M, K, stride_A, ALayout{}));
+    Tensor<BDataType> b_g_k_n(f_host_tensor_descriptor(batch_count, K, N, stride_B, BLayout{}));
+    auto f_host_c_tensor_descriptor = [](std::size_t G0_,
+                                         std::size_t G1_,
+                                         std::size_t M_,
+                                         std::size_t N_,
+                                         std::size_t stride_G0_,
+                                         std::size_t stride_G1_,
+                                         std::size_t stride_M_,
+                                         std::size_t stride_N_) {
+        return HostTensorDescriptor(
+            std::vector<std::size_t>({G0_, G1_, M_, N_}),
+            std::vector<std::size_t>({stride_G0_, stride_G1_, stride_M_, stride_N_}));
+    };
+    Tensor<CDataType> c_g0_g1_m_n_host_result(
+        f_host_c_tensor_descriptor(G0, G1, M, N, stride_G0, stride_G1, stride_M, stride_N));
+    Tensor<CDataType> c_g0_g1_m_n_device_result(
+        f_host_c_tensor_descriptor(G0, G1, M, N, stride_G0, stride_G1, stride_M, stride_N));
+    std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
+    std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
+    std::cout << "c_g0_g1_m_n: " << c_g0_g1_m_n_host_result.mDesc << std::endl;
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_g_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        break;
+    default:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_g_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        break;
+    }
+    DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpace());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpace());
+    DeviceMem c_device_buf(sizeof(CDataType) * c_g0_g1_m_n_device_result.mDesc.GetElementSpace());
+    a_device_buf.ToDevice(a_g_m_k.mData.data());
+    b_device_buf.ToDevice(b_g_k_n.mData.data());
+    auto a_element_op = AElementOp{};
+    auto b_element_op = BElementOp{};
+    auto c_element_op = CElementOp{};
+    auto gemm    = DeviceGemmInstance{};
+    auto invoker = gemm.MakeInvoker();
+    // do GEMM
+    auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                      static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                      static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                                      M,
+                                      N,
+                                      K,
+                                      stride_A,
+                                      stride_B,
+                                      batched_gemm_c_permute_desc,
+                                      a_element_op,
+                                      b_element_op,
+                                      c_element_op,
+                                      batch_count);
+    if(!gemm.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+    std::size_t flop      = std::size_t(2) * batch_count * M * N * K;
+    std::size_t num_btype = sizeof(ADataType) * batch_count * M * K +
+                            sizeof(BDataType) * batch_count * K * N +
+                            sizeof(CDataType) * batch_count * M * N;
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << gemm.GetTypeString() << std::endl;
+    bool pass = true;
+    if(do_verification)
+    {
+        c_device_buf.FromDevice(c_g0_g1_m_n_device_result.mData.data());
+        auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
+        auto ref_invoker      = ref_batched_gemm.MakeInvoker();
+        Tensor<CDataType> c_g_m_n_host_result = HostTensorDescriptor(
+            std::vector<std::size_t>({batch_count, M, N}), std::vector<std::size_t>({M * N, N, 1}));
+        auto ref_argument = ref_batched_gemm.MakeArgument(
+            a_g_m_k, b_g_k_n, c_g_m_n_host_result, a_element_op, b_element_op, c_element_op);
+        ref_invoker.Run(ref_argument);
+        for(int g0 = 0; g0 < G0; g0++)
+        {
+            for(int g1 = 0; g1 < G1; g1++)
+            {
+                for(int m = 0; m < M; m++)
+                {
+                    for(int n = 0; n < N; n++)
+                    {
+                        int g                                 = g0 * G1 + g1;
+                        c_g0_g1_m_n_host_result(g0, g1, m, n) = c_g_m_n_host_result(g, m, n);
+                    }
+                }
+            }
+        }
+        pass = ck::utils::check_err(c_g0_g1_m_n_host_result.mData,
+                                    c_g0_g1_m_n_device_result.mData,
+                                    "Error: Incorrect results c");
+    }
+    return pass ? 0 : 1;
+}
--- a/example/25_gemm_bias_c_permute/CMakeLists.txt
+++ b/example/25_gemm_bias_c_permute/CMakeLists.txt
+add_example_executable(example_gemm_bias_c_permute_xdl_fp16 gemm_bias_c_permute_xdl_fp16.cpp)
--- a/example/25_gemm_bias_c_permute/gemm_bias_c_permute_xdl_fp16.cpp
+++ b/example/25_gemm_bias_c_permute/gemm_bias_c_permute_xdl_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_bias_c_permute_xdl.hpp"
+#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using F16 = ck::half_t;
+using F32 = float;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Add         = ck::tensor_operation::element_wise::Add;
+using ADataType        = F16;
+using BDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using DDataType        = F16;
+using EDataType        = F16;
+using ALayout = Row;
+using BLayout = Col;
+using DLayout = Row;
+using ELayout = Row;
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = Add;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+// clang-format off
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmBiasCPermute_Xdl
+//######| ALayout| BLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+//######|        |        |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######|        |        |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+//######|        |        |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        < ALayout, BLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType,  DDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               1>;
+// clang-format on
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+    ck::index_t M0 = 4;
+    ck::index_t M1 = 32;
+    ck::index_t M2 = 128;
+    ck::index_t N0 = 16;
+    ck::index_t N1 = 256;
+    // GEMM shape
+    ck::index_t M = M0 * M1 * M2;
+    ck::index_t N = N0 * N1;
+    ck::index_t K = 128;
+    ck::index_t stride_A = K;
+    ck::index_t stride_B = K;
+#if 1
+    // E = [M0, N0, M1, N1, M2]
+    ck::index_t stride_E_M0 = N0 * M1 * N1 * M2;
+    ck::index_t stride_E_M1 = N1 * M2;
+    ck::index_t stride_E_M2 = 1;
+    ck::index_t stride_E_N0 = M1 * N1 * M2;
+    ck::index_t stride_E_N1 = M2;
+    // D = [0, N0, 0, N1, 0]
+    ck::index_t stride_D_M0 = 0;
+    ck::index_t stride_D_M1 = 0;
+    ck::index_t stride_D_M2 = 0;
+    ck::index_t stride_D_N0 = N1;
+    ck::index_t stride_D_N1 = 1;
+#else
+    // D = [0, 0, 0, N0, N1]
+    ck::index_t stride_D_M0 = 0;
+    ck::index_t stride_D_M1 = 0;
+    ck::index_t stride_D_M2 = 0;
+    ck::index_t stride_D_N0 = N1;
+    ck::index_t stride_D_N1 = 1;
+    // E = [M0, M1, M2, N0, N1]
+    ck::index_t stride_E_M0 = M1 * M2 * N0 * N1;
+    ck::index_t stride_E_M1 = M2 * N0 * N1;
+    ck::index_t stride_E_M2 = N0 * N1;
+    ck::index_t stride_E_N0 = N1;
+    ck::index_t stride_E_N1 = 1;
+#endif
+    const ck::tensor_operation::device::DEGridDesc_M0_M1_M2_N0_N1 d_grid_desc{
+        M0, M1, M2, N0, N1, stride_D_M0, stride_D_M1, stride_D_M2, stride_D_N0, stride_D_N1};
+    const ck::tensor_operation::device::DEGridDesc_M0_M1_M2_N0_N1 e_grid_desc{
+        M0, M1, M2, N0, N1, stride_E_M0, stride_E_M1, stride_E_M2, stride_E_N0, stride_E_N1};
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                            std::vector<std::size_t>({stride, 1}));
+            }
+            else
+            {
+                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                            std::vector<std::size_t>({1, stride}));
+            }
+        };
+    auto f_host_de_tensor_descriptor =
+        [](ck::tensor_operation::device::DEGridDesc_M0_M1_M2_N0_N1 de_grid_desc) {
+            std::size_t m0        = de_grid_desc.M0_;
+            std::size_t m1        = de_grid_desc.M1_;
+            std::size_t m2        = de_grid_desc.M2_;
+            std::size_t n0        = de_grid_desc.N0_;
+            std::size_t n1        = de_grid_desc.N1_;
+            std::size_t stride_m0 = de_grid_desc.stride_M0_;
+            std::size_t stride_m1 = de_grid_desc.stride_M1_;
+            std::size_t stride_m2 = de_grid_desc.stride_M2_;
+            std::size_t stride_n0 = de_grid_desc.stride_N0_;
+            std::size_t stride_n1 = de_grid_desc.stride_N1_;
+            return HostTensorDescriptor(
+                std::vector<std::size_t>({m0, m1, m2, n0, n1}),
+                std::vector<std::size_t>({stride_m0, stride_m1, stride_m2, stride_n0, stride_n1}));
+        };
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, stride_A, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, stride_B, BLayout{}));
+    Tensor<DDataType> d_m0_m1_m2_n0_n1(f_host_de_tensor_descriptor(d_grid_desc));
+    Tensor<EDataType> e_m0_m1_m2_n0_n1_host_result(f_host_de_tensor_descriptor(e_grid_desc));
+    Tensor<EDataType> e_m0_m1_m2_n0_n1_device_result(f_host_de_tensor_descriptor(e_grid_desc));
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "d_m0_m1_m2_n0_n1: " << d_m0_m1_m2_n0_n1.mDesc << std::endl;
+    std::cout << "e_m0_m1_m2_n0_n1: " << e_m0_m1_m2_n0_n1_host_result.mDesc << std::endl;
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        d_m0_m1_m2_n0_n1.GenerateTensorValue(GeneratorTensor_2<DDataType>{-5, 5});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        d_m0_m1_m2_n0_n1.GenerateTensorValue(GeneratorTensor_3<DDataType>{0.0, 1.0});
+    }
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
+    DeviceMem d_m0_m1_m2_n0_n1_device_buf(sizeof(DDataType) *
+                                          d_m0_m1_m2_n0_n1.mDesc.GetElementSpace());
+    DeviceMem e_m0_m1_m2_n0_n1_device_buf(sizeof(EDataType) *
+                                          e_m0_m1_m2_n0_n1_device_result.mDesc.GetElementSpace());
+    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
+    b_k_n_device_buf.ToDevice(b_k_n.mData.data());
+    d_m0_m1_m2_n0_n1_device_buf.ToDevice(d_m0_m1_m2_n0_n1.mData.data());
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+    // do GEMM
+    auto device_op = DeviceOpInstance{};
+    auto invoker   = device_op.MakeInvoker();
+    auto argument  = device_op.MakeArgument(a_m_k_device_buf.GetDeviceBuffer(),
+                                           b_k_n_device_buf.GetDeviceBuffer(),
+                                           d_m0_m1_m2_n0_n1_device_buf.GetDeviceBuffer(),
+                                           e_m0_m1_m2_n0_n1_device_buf.GetDeviceBuffer(),
+                                           M,
+                                           N,
+                                           K,
+                                           stride_A,
+                                           stride_B,
+                                           d_grid_desc,
+                                           e_grid_desc,
+                                           a_element_op,
+                                           b_element_op,
+                                           cde_element_op);
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error("wrong! this device_op instance does not support this problem");
+    }
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+    std::size_t flop      = std::size_t(2) * M * N * K;
+    std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                            sizeof(DDataType) * N + sizeof(EDataType) * M * N;
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << device_op.GetTypeString() << std::endl;
+    if(do_verification)
+    {
+        Tensor<AccDataType> c_m_n(HostTensorDescriptor(
+            std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                                BDataType,
+                                                                                AccDataType,
+                                                                                AccDataType,
+                                                                                AElementOp,
+                                                                                BElementOp,
+                                                                                PassThrough>;
+        auto ref_gemm    = ReferenceGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+        auto ref_argument =
+            ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
+        ref_invoker.Run(ref_argument);
+        for(int m0 = 0; m0 < M0; ++m0)
+            for(int m1 = 0; m1 < M1; ++m1)
+                for(int m2 = 0; m2 < M2; ++m2)
+                    for(int n0 = 0; n0 < N0; ++n0)
+                        for(int n1 = 0; n1 < N1; ++n1)
+                        {
+                            int m = m0 * M1 * M2 + m1 * M2 + m2;
+                            int n = n0 * N1 + n1;
+                            cde_element_op(e_m0_m1_m2_n0_n1_host_result(m0, m1, m2, n0, n1),
+                                           ck::type_convert<EDataType>(c_m_n(m, n)),
+                                           d_m0_m1_m2_n0_n1(m0, m1, m2, n0, n1));
+                        }
+        e_m0_m1_m2_n0_n1_device_buf.FromDevice(e_m0_m1_m2_n0_n1_device_result.mData.data());
+        return ck::utils::check_err(e_m0_m1_m2_n0_n1_device_result.mData,
+                                    e_m0_m1_m2_n0_n1_host_result.mData)
+                   ? 0
+                   : 1;
+    }
+    return 0;
+}
--- a/example/26_contraction/CMakeLists.txt
+++ b/example/26_contraction/CMakeLists.txt
+add_example_executable(example_contraction_bilinear_xdl_fp32 contraction_bilinear_xdl_fp32.cpp)
+add_example_executable(example_contraction_scale_xdl_fp32 contraction_scale_xdl_fp32.cpp)
--- a/example/26_contraction/README.md
+++ b/example/26_contraction/README.md
+# Instructions for ```example_contraction_bilinear_xdl_fp32```
+## Run
+```bash
+#arg1: verification (0=no, 1=yes)
+#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
+#arg3: time kernel (0=no, 1=yes)
+./bin/example_contraction_bilinear_xdl_fp32 1 1 1
+```
+Result (MI100 @ dynammic freq, 46TFlops peak FP32)
+```
+a_ms_ks: dim 4, lengths {30, 128, 32, 64}, strides {524288, 4096, 128, 1}
+b_ks_ns: dim 4, lengths {32, 64, 32, 64}, strides {128, 1, 524288, 4096}
+c_ms_ns: dim 4, lengths {30, 128, 32, 64}, strides {524288, 4096, 128, 1}
+launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
+Warm up 1 time
+Start running 10 times...
+Perf: 0.843286 ms, 38.1985 TFlops, 94.5014 GB/s, DeviceContractionMultipleD_Xdl_CShuffle<256, 256, 128, 16, 4, 4>
+```
--- a/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using F32 = float;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using ADataType        = F32;
+using BDataType        = F32;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using DDataType        = F32;
+using DsDataType       = ck::Tuple<DDataType>;
+using EDataType        = F32;
+static constexpr ck::index_t NumDimM = 2;
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 2;
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+// clang-format off
+using DeviceOpInstanceKKNN = ck::tensor_operation::device::
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|     DsData| EData|            A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |  Type|  Type|    Type| DataType|       Type|  Type|  Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |      |      |        |         |           |      |    Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |      |      |        |         |           |      |             |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK,   F32,   F32,     F32,      F32, DsDataType,   F32,   AElementOp,  BElementOp, CDEElementOp,       GemmSpec,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>;
+using DeviceOpInstanceKNNN = ck::tensor_operation::device::
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|     DsData| EData|            A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |  Type|  Type|    Type| DataType|       Type|  Type|  Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |      |      |        |         |           |      |    Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |      |      |        |         |           |      |             |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK,   F32,   F32,     F32,      F32, DsDataType,   F32,   AElementOp,  BElementOp, CDEElementOp,       GemmSpec,        1,   256,   256,   128,    16,   4,   1,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4>;
+using DeviceOpInstanceMKNN = ck::tensor_operation::device::
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|     DsData| EData|            A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |  Type|  Type|    Type| DataType|       Type|  Type|  Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |      |      |        |         |           |      |    Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |      |      |        |         |           |      |             |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK,   F32,   F32,     F32,      F32, DsDataType,   F32,   AElementOp,  BElementOp, CDEElementOp,       GemmSpec,        1,   256,   256,   128,    16,   1,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>;
+using DeviceOpInstanceMNNN = ck::tensor_operation::device::
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|     DsData| EData|            A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |  Type|  Type|    Type| DataType|       Type|  Type|  Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |      |      |        |         |           |      |    Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |      |      |        |         |           |      |             |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK,   F32,   F32,     F32,      F32, DsDataType,   F32,   AElementOp,  BElementOp, CDEElementOp,       GemmSpec,        1,   256,   256,   128,    16,   1,   1,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4>;
+// clang-format on
+using DeviceOpInstance = DeviceOpInstanceKKNN;
+// hardcoded for NumDimM == NumDimN == NumDimK == 2
+template <ck::index_t NumDimM,
+          ck::index_t NumDimN,
+          ck::index_t NumDimK,
+          typename ADataType,
+          typename BDataType,
+          typename EDataType,
+          typename AccDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation,
+          ck::enable_if_t<NumDimM == 2 && NumDimN == 2 && NumDimK == 2, bool> = false>
+struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::BaseOperator
+{
+    // Argument
+    struct Argument : public ck::tensor_operation::device::BaseArgument
+    {
+        Argument(const Tensor<ADataType>& a_ms_ks,
+                 const Tensor<BDataType>& b_ns_ks,
+                 Tensor<EDataType>& e_ms_ns,
+                 AElementwiseOperation a_element_op,
+                 BElementwiseOperation b_element_op,
+                 CDEElementwiseOperation cde_element_op)
+            : a_ms_ks_{a_ms_ks},
+              b_ns_ks_{b_ns_ks},
+              e_ms_ns_{e_ms_ns},
+              a_element_op_{a_element_op},
+              b_element_op_{b_element_op},
+              cde_element_op_{cde_element_op}
+        {
+        }
+        const Tensor<ADataType>& a_ms_ks_;
+        const Tensor<BDataType>& b_ns_ks_;
+        Tensor<EDataType>& e_ms_ns_;
+        AElementwiseOperation a_element_op_;
+        BElementwiseOperation b_element_op_;
+        CDEElementwiseOperation cde_element_op_;
+    };
+    // Invoker
+    struct Invoker : public ck::tensor_operation::device::BaseInvoker
+    {
+        using Argument = ReferenceContraction_M2_N2_K2::Argument;
+        float Run(const Argument& arg)
+        {
+            auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) {
+                const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[2];
+                const int K1 = arg.a_ms_ks_.mDesc.GetLengths()[3];
+                AccDataType v_acc = 0;
+                for(int k0 = 0; k0 < K0; ++k0)
+                {
+                    for(int k1 = 0; k1 < K1; ++k1)
+                    {
+                        AccDataType v_a;
+                        AccDataType v_b;
+                        arg.a_element_op_(
+                            v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0, k1)));
+                        arg.b_element_op_(
+                            v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0, k1)));
+                        v_acc += v_a * v_b;
+                    }
+                }
+                AccDataType v_c;
+                arg.cde_element_op_(v_c, v_acc);
+                arg.e_ms_ns_(m0, m1, n0, n1) = v_c;
+            };
+            make_ParallelTensorFunctor(f_ms_ns,
+                                       arg.e_ms_ns_.mDesc.GetLengths()[0],
+                                       arg.e_ms_ns_.mDesc.GetLengths()[1],
+                                       arg.e_ms_ns_.mDesc.GetLengths()[2],
+                                       arg.e_ms_ns_.mDesc.GetLengths()[3])(
+                std::thread::hardware_concurrency());
+            return 0;
+        }
+        float Run(const ck::tensor_operation::device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg));
+        }
+    };
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+    bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override
+    {
+        return true;
+    }
+    static auto MakeArgument(const Tensor<ADataType>& a_ms_ks,
+                             const Tensor<BDataType>& b_ns_ks,
+                             Tensor<EDataType>& e_ms_ns,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CDEElementwiseOperation cde_element_op)
+    {
+        return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op};
+    }
+    static auto MakeInvoker() { return Invoker{}; }
+    virtual std::unique_ptr<ck::tensor_operation::device::BaseInvoker> MakeInvokerPointer()
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+        // clang-format off
+        str << "ReferenceContraction_M2_N2_K2"
+            << std::endl;
+        // clang-format on
+        return str.str();
+    }
+};
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+    // A[M0, M1, K0, K1]
+    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
+    // B[N0, N1, K0, K1]
+    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
+    std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
+    // D[M0, M1, N0, N1]
+    std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
+    // E[M0, M1, N0, N1]
+    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
+    float alpha = 1.f;
+    float beta  = 1.f;
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 28)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+        const ck::index_t M0 = std::stoi(argv[4]);
+        const ck::index_t M1 = std::stoi(argv[5]);
+        const ck::index_t N0 = std::stoi(argv[6]);
+        const ck::index_t N1 = std::stoi(argv[7]);
+        const ck::index_t K0 = std::stoi(argv[8]);
+        const ck::index_t K1 = std::stoi(argv[9]);
+        a_ms_ks_lengths = {M0, M1, K0, K1};
+        a_ms_ks_strides = {
+            std::stoi(argv[10]), std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13])};
+        b_ns_ks_lengths = {N0, N1, K0, K1};
+        b_ns_ks_strides = {
+            std::stoi(argv[14]), std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17])};
+        d_ms_ns_lengths = {M0, M1, N0, N1};
+        d_ms_ns_strides = {
+            std::stoi(argv[18]), std::stoi(argv[19]), std::stoi(argv[20]), std::stoi(argv[21])};
+        e_ms_ns_lengths = {M0, M1, N0, N1};
+        e_ms_ns_strides = {
+            std::stoi(argv[22]), std::stoi(argv[23]), std::stoi(argv[24]), std::stoi(argv[25])};
+        alpha = std::stof(argv[26]);
+        beta  = std::stof(argv[27]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 7: M0, M1, N0, N1, K0, K1\n");
+        printf("arg10 to 13: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
+        printf("arg14 to 17: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
+        printf("arg18 to 21: Stride_D_M0, Stride_D_M1, Stride_D_N0, Stride_D_N1\n");
+        printf("arg22 to 25: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
+        printf("arg26 to 27: alpha, beta\n");
+        exit(0);
+    }
+    Tensor<ADataType> a_ms_ks(
+        std::vector<std::size_t>(a_ms_ks_lengths.begin(), a_ms_ks_lengths.end()),
+        std::vector<std::size_t>(a_ms_ks_strides.begin(), a_ms_ks_strides.end()));
+    Tensor<BDataType> b_ns_ks(
+        std::vector<std::size_t>(b_ns_ks_lengths.begin(), b_ns_ks_lengths.end()),
+        std::vector<std::size_t>(b_ns_ks_strides.begin(), b_ns_ks_strides.end()));
+    Tensor<EDataType> d_ms_ns(
+        std::vector<std::size_t>(d_ms_ns_lengths.begin(), d_ms_ns_lengths.end()),
+        std::vector<std::size_t>(d_ms_ns_strides.begin(), d_ms_ns_strides.end()));
+    Tensor<EDataType> e_ms_ns_host_result(
+        std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
+        std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
+    Tensor<EDataType> e_ms_ns_device_result(
+        std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
+        std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
+    std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl;
+    std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl;
+    std::cout << "d_ms_ns: " << d_ms_ns.mDesc << std::endl;
+    std::cout << "e_ms_ns: " << e_ms_ns_host_result.mDesc << std::endl;
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_ns_ks.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        d_ms_ns.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        break;
+    default:
+        a_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_ns_ks.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        d_ms_ns.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        break;
+    }
+    DeviceMem a_device_buf(sizeof(ADataType) * a_ms_ks.mDesc.GetElementSpace());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_ns_ks.mDesc.GetElementSpace());
+    DeviceMem d_device_buf(sizeof(DDataType) * d_ms_ns.mDesc.GetElementSpace());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_ms_ns_device_result.mDesc.GetElementSpace());
+    a_device_buf.ToDevice(a_ms_ks.mData.data());
+    b_device_buf.ToDevice(b_ns_ks.mData.data());
+    d_device_buf.ToDevice(d_ms_ns.mData.data());
+    // set zero
+    e_device_buf.SetZero();
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{alpha, beta};
+    // device operation
+    auto op       = DeviceOpInstance{};
+    auto invoker  = op.MakeInvoker();
+    auto argument = op.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                                    b_device_buf.GetDeviceBuffer(),
+                                    std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
+                                    e_device_buf.GetDeviceBuffer(),
+                                    a_ms_ks_lengths,
+                                    a_ms_ks_strides,
+                                    b_ns_ks_lengths,
+                                    b_ns_ks_strides,
+                                    std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
+                                    std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
+                                    e_ms_ns_lengths,
+                                    e_ms_ns_strides,
+                                    a_element_op,
+                                    b_element_op,
+                                    cde_element_op);
+    if(!op.IsSupportedArgument(argument))
+    {
+        std::cout << op.GetTypeString() << " does not support this problem" << std::endl;
+        return 0;
+    }
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+    ck::index_t M = std::accumulate(e_ms_ns_lengths.begin(),
+                                    e_ms_ns_lengths.begin() + NumDimM,
+                                    ck::index_t{1},
+                                    std::multiplies<ck::index_t>{});
+    ck::index_t N = std::accumulate(e_ms_ns_lengths.begin() + NumDimM,
+                                    e_ms_ns_lengths.begin() + NumDimM + NumDimN,
+                                    ck::index_t{1},
+                                    std::multiplies<ck::index_t>{});
+    ck::index_t K = std::accumulate(a_ms_ks_lengths.begin() + NumDimM,
+                                    a_ms_ks_lengths.begin() + NumDimM + NumDimK,
+                                    ck::index_t{1},
+                                    std::multiplies<ck::index_t>{});
+    std::size_t flop      = std::size_t(2) * M * N * K;
+    std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                            sizeof(DDataType) * M * N + sizeof(EDataType) * M * N;
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << op.GetTypeString() << std::endl;
+    e_device_buf.FromDevice(e_ms_ns_device_result.mData.data());
+    if(do_verification)
+    {
+        Tensor<CShuffleDataType> c_ms_ns_host_result(
+            std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
+            std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
+        using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM,
+                                                                  NumDimN,
+                                                                  NumDimK,
+                                                                  ADataType,
+                                                                  BDataType,
+                                                                  CShuffleDataType,
+                                                                  AccDataType,
+                                                                  AElementOp,
+                                                                  BElementOp,
+                                                                  PassThrough>;
+        auto ref_gemm    = ReferenceOpInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
+        ref_invoker.Run(ref_argument);
+        for(size_t m0 = 0; m0 < e_ms_ns_host_result.mDesc.GetLengths()[0]; ++m0)
+        {
+            for(size_t m1 = 0; m1 < e_ms_ns_host_result.mDesc.GetLengths()[1]; ++m1)
+            {
+                for(size_t n0 = 0; n0 < e_ms_ns_host_result.mDesc.GetLengths()[2]; ++n0)
+                {
+                    for(size_t n1 = 0; n1 < e_ms_ns_host_result.mDesc.GetLengths()[3]; ++n1)
+                    {
+                        cde_element_op(e_ms_ns_host_result(m0, m1, n0, n1),
+                                       c_ms_ns_host_result(m0, m1, n0, n1),
+                                       d_ms_ns(m0, m1, n0, n1));
+                    }
+                }
+            }
+        }
+        return ck::utils::check_err(e_ms_ns_device_result.mData, e_ms_ns_host_result.mData) ? 0 : 1;
+    }
+    return 0;
+}
--- a/example/26_contraction/contraction_scale_xdl_fp32.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using F32 = float;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using ADataType        = F32;
+using BDataType        = F32;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using DsDataType       = ck::Tuple<>;
+using EDataType        = F32;
+static constexpr ck::index_t NumDimM = 2;
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 2;
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using CDEElementOp = ck::tensor_operation::element_wise::Scale;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+// clang-format off
+using DeviceOpInstanceKKN = ck::tensor_operation::device::
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|     DsData| EData|            A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |  Type|  Type|    Type| DataType|       Type|  Type|  Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |      |      |        |         |           |      |    Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |      |      |        |         |           |      |             |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK,   F32,   F32,     F32,      F32, DsDataType,   F32,   AElementOp,  BElementOp, CDEElementOp,       GemmSpec,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>;
+using DeviceOpInstanceKNN = ck::tensor_operation::device::
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|     DsData| EData|            A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |  Type|  Type|    Type| DataType|       Type|  Type|  Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |      |      |        |         |           |      |    Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |      |      |        |         |           |      |             |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK,   F32,   F32,     F32,      F32, DsDataType,   F32,   AElementOp,  BElementOp, CDEElementOp,       GemmSpec,        1,   256,   256,   128,    16,   4,   1,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4>;
+using DeviceOpInstanceMKN = ck::tensor_operation::device::
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|     DsData| EData|            A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |  Type|  Type|    Type| DataType|       Type|  Type|  Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |      |      |        |         |           |      |    Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |      |      |        |         |           |      |             |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK,   F32,   F32,     F32,      F32, DsDataType,   F32,   AElementOp,  BElementOp, CDEElementOp,       GemmSpec,        1,   256,   256,   128,    16,   1,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>;
+using DeviceOpInstanceMNN = ck::tensor_operation::device::
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|     DsData| EData|            A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |  Type|  Type|    Type| DataType|       Type|  Type|  Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |      |      |        |         |           |      |    Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |      |      |        |         |           |      |             |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK,   F32,   F32,     F32,      F32, DsDataType,   F32,   AElementOp,  BElementOp, CDEElementOp,       GemmSpec,        1,   256,   256,   128,    16,   1,   1,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4>;
+// clang-format on
+using DeviceOpInstance = DeviceOpInstanceKKN;
+// hardcoded for NumDimM == NumDimN == NumDimK == 2
+template <ck::index_t NumDimM,
+          ck::index_t NumDimN,
+          ck::index_t NumDimK,
+          typename ADataType,
+          typename BDataType,
+          typename EDataType,
+          typename AccDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation,
+          ck::enable_if_t<NumDimM == 2 && NumDimN == 2 && NumDimK == 2, bool> = false>
+struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::BaseOperator
+{
+    // Argument
+    struct Argument : public ck::tensor_operation::device::BaseArgument
+    {
+        Argument(const Tensor<ADataType>& a_ms_ks,
+                 const Tensor<BDataType>& b_ns_ks,
+                 Tensor<EDataType>& e_ms_ns,
+                 AElementwiseOperation a_element_op,
+                 BElementwiseOperation b_element_op,
+                 CDEElementwiseOperation cde_element_op)
+            : a_ms_ks_{a_ms_ks},
+              b_ns_ks_{b_ns_ks},
+              e_ms_ns_{e_ms_ns},
+              a_element_op_{a_element_op},
+              b_element_op_{b_element_op},
+              cde_element_op_{cde_element_op}
+        {
+        }
+        const Tensor<ADataType>& a_ms_ks_;
+        const Tensor<BDataType>& b_ns_ks_;
+        Tensor<EDataType>& e_ms_ns_;
+        AElementwiseOperation a_element_op_;
+        BElementwiseOperation b_element_op_;
+        CDEElementwiseOperation cde_element_op_;
+    };
+    // Invoker
+    struct Invoker : public ck::tensor_operation::device::BaseInvoker
+    {
+        using Argument = ReferenceContraction_M2_N2_K2::Argument;
+        float Run(const Argument& arg)
+        {
+            auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) {
+                const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[2];
+                const int K1 = arg.a_ms_ks_.mDesc.GetLengths()[3];
+                AccDataType v_acc = 0;
+                for(int k0 = 0; k0 < K0; ++k0)
+                {
+                    for(int k1 = 0; k1 < K1; ++k1)
+                    {
+                        AccDataType v_a;
+                        AccDataType v_b;
+                        arg.a_element_op_(
+                            v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0, k1)));
+                        arg.b_element_op_(
+                            v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0, k1)));
+                        v_acc += v_a * v_b;
+                    }
+                }
+                AccDataType v_c;
+                arg.cde_element_op_(v_c, v_acc);
+                arg.e_ms_ns_(m0, m1, n0, n1) = v_c;
+            };
+            make_ParallelTensorFunctor(f_ms_ns,
+                                       arg.e_ms_ns_.mDesc.GetLengths()[0],
+                                       arg.e_ms_ns_.mDesc.GetLengths()[1],
+                                       arg.e_ms_ns_.mDesc.GetLengths()[2],
+                                       arg.e_ms_ns_.mDesc.GetLengths()[3])(
+                std::thread::hardware_concurrency());
+            return 0;
+        }
+        float Run(const ck::tensor_operation::device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg));
+        }
+    };
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+    bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override
+    {
+        return true;
+    }
+    static auto MakeArgument(const Tensor<ADataType>& a_ms_ks,
+                             const Tensor<BDataType>& b_ns_ks,
+                             Tensor<EDataType>& e_ms_ns,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CDEElementwiseOperation cde_element_op)
+    {
+        return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op};
+    }
+    static auto MakeInvoker() { return Invoker{}; }
+    virtual std::unique_ptr<ck::tensor_operation::device::BaseInvoker> MakeInvokerPointer()
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+        // clang-format off
+        str << "ReferenceContraction_M2_N2_K2"
+            << std::endl;
+        // clang-format on
+        return str.str();
+    }
+};
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+    // A[M0, M1, K0, K1]
+    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
+    // B[N0, N1, K0, K1]
+    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
+    std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
+    // E[M0, M1, N0, N1]
+    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
+    float scale = 1.f;
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 23)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+        const ck::index_t M0 = std::stoi(argv[4]);
+        const ck::index_t M1 = std::stoi(argv[5]);
+        const ck::index_t N0 = std::stoi(argv[6]);
+        const ck::index_t N1 = std::stoi(argv[7]);
+        const ck::index_t K0 = std::stoi(argv[8]);
+        const ck::index_t K1 = std::stoi(argv[9]);
+        a_ms_ks_lengths = {M0, M1, K0, K1};
+        a_ms_ks_strides = {
+            std::stoi(argv[10]), std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13])};
+        b_ns_ks_lengths = {N0, N1, K0, K1};
+        b_ns_ks_strides = {
+            std::stoi(argv[14]), std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17])};
+        e_ms_ns_lengths = {M0, M1, N0, N1};
+        e_ms_ns_strides = {
+            std::stoi(argv[22]), std::stoi(argv[23]), std::stoi(argv[24]), std::stoi(argv[25])};
+        scale = std::stof(argv[26]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 7: M0, M1, N0, N1, K0, K1\n");
+        printf("arg10 to 13: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
+        printf("arg14 to 17: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
+        printf("arg18 to 21: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
+        printf("arg22: scale\n");
+        exit(0);
+    }
+    Tensor<ADataType> a_ms_ks(
+        std::vector<std::size_t>(a_ms_ks_lengths.begin(), a_ms_ks_lengths.end()),
+        std::vector<std::size_t>(a_ms_ks_strides.begin(), a_ms_ks_strides.end()));
+    Tensor<BDataType> b_ns_ks(
+        std::vector<std::size_t>(b_ns_ks_lengths.begin(), b_ns_ks_lengths.end()),
+        std::vector<std::size_t>(b_ns_ks_strides.begin(), b_ns_ks_strides.end()));
+    Tensor<EDataType> e_ms_ns_host_result(
+        std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
+        std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
+    Tensor<EDataType> e_ms_ns_device_result(
+        std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
+        std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
+    std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl;
+    std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl;
+    std::cout << "e_ms_ns: " << e_ms_ns_host_result.mDesc << std::endl;
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_ns_ks.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        break;
+    default:
+        a_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_ns_ks.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        break;
+    }
+    DeviceMem a_device_buf(sizeof(ADataType) * a_ms_ks.mDesc.GetElementSpace());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_ns_ks.mDesc.GetElementSpace());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_ms_ns_device_result.mDesc.GetElementSpace());
+    a_device_buf.ToDevice(a_ms_ks.mData.data());
+    b_device_buf.ToDevice(b_ns_ks.mData.data());
+    // set zero
+    e_device_buf.SetZero();
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{scale};
+    // device operation
+    auto op       = DeviceOpInstance{};
+    auto invoker  = op.MakeInvoker();
+    auto argument = op.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                                    b_device_buf.GetDeviceBuffer(),
+                                    std::array<const void*, 0>{},
+                                    e_device_buf.GetDeviceBuffer(),
+                                    a_ms_ks_lengths,
+                                    a_ms_ks_strides,
+                                    b_ns_ks_lengths,
+                                    b_ns_ks_strides,
+                                    std::array<std::vector<ck::index_t>, 0>{},
+                                    std::array<std::vector<ck::index_t>, 0>{},
+                                    e_ms_ns_lengths,
+                                    e_ms_ns_strides,
+                                    a_element_op,
+                                    b_element_op,
+                                    cde_element_op);
+    if(!op.IsSupportedArgument(argument))
+    {
+        std::cout << op.GetTypeString() << " does not support this problem" << std::endl;
+        return 0;
+    }
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+    ck::index_t M = std::accumulate(e_ms_ns_lengths.begin(),
+                                    e_ms_ns_lengths.begin() + NumDimM,
+                                    ck::index_t{1},
+                                    std::multiplies<ck::index_t>{});
+    ck::index_t N = std::accumulate(e_ms_ns_lengths.begin() + NumDimM,
+                                    e_ms_ns_lengths.begin() + NumDimM + NumDimN,
+                                    ck::index_t{1},
+                                    std::multiplies<ck::index_t>{});
+    ck::index_t K = std::accumulate(a_ms_ks_lengths.begin() + NumDimM,
+                                    a_ms_ks_lengths.begin() + NumDimM + NumDimK,
+                                    ck::index_t{1},
+                                    std::multiplies<ck::index_t>{});
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_btype =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + +sizeof(EDataType) * M * N;
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << op.GetTypeString() << std::endl;
+    e_device_buf.FromDevice(e_ms_ns_device_result.mData.data());
+    if(do_verification)
+    {
+        Tensor<CShuffleDataType> c_ms_ns_host_result(
+            std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
+            std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
+        using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM,
+                                                                  NumDimN,
+                                                                  NumDimK,
+                                                                  ADataType,
+                                                                  BDataType,
+                                                                  CShuffleDataType,
+                                                                  AccDataType,
+                                                                  AElementOp,
+                                                                  BElementOp,
+                                                                  PassThrough>;
+        auto ref_gemm    = ReferenceOpInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
+        ref_invoker.Run(ref_argument);
+        for(size_t m0 = 0; m0 < e_ms_ns_host_result.mDesc.GetLengths()[0]; ++m0)
+        {
+            for(size_t m1 = 0; m1 < e_ms_ns_host_result.mDesc.GetLengths()[1]; ++m1)
+            {
+                for(size_t n0 = 0; n0 < e_ms_ns_host_result.mDesc.GetLengths()[2]; ++n0)
+                {
+                    for(size_t n1 = 0; n1 < e_ms_ns_host_result.mDesc.GetLengths()[3]; ++n1)
+                    {
+                        cde_element_op(e_ms_ns_host_result(m0, m1, n0, n1),
+                                       c_ms_ns_host_result(m0, m1, n0, n1));
+                    }
+                }
+            }
+        }
+        return ck::utils::check_err(e_ms_ns_device_result.mData, e_ms_ns_host_result.mData) ? 0 : 1;
+    }
+    return 0;
+}