"test/vscode:/vscode.git/clone" did not exist on "077a5f4aef25b3164fbac11887b85a2ffc53f920"
Commit 4ed59413 authored by Chao Liu's avatar Chao Liu
Browse files

Merge remote-tracking branch 'origin/develop' into fix_0813

parents 8bea6b2d 0bd6b842
@@ -9,7 +9,7 @@
 #include "ck/ck.hpp"
 #include "ck/utility/reduction_enums.hpp"
-#include "ck/tensor_operation/gpu/device/device_layernorm.hpp"
+#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp"
 #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -29,7 +29,7 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 constexpr int Rank         = 2;
 constexpr int NumReduceDim = 1;
-using DeviceInstance = ck::tensor_operation::device::DeviceLayernorm<XDataType,
+using DeviceInstance = ck::tensor_operation::device::DeviceLayernormImpl<XDataType,
                                                                      GammaDataType,
                                                                      BetaDataType,
                                                                      AccDataType,
@@ -90,6 +90,7 @@ int main()
         std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()},
         std::vector<ck::index_t>{gamma.mDesc.GetStrides().begin(), gamma.mDesc.GetStrides().end()},
         std::vector<ck::index_t>{beta.mDesc.GetStrides().begin(), beta.mDesc.GetStrides().end()},
+        std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
         {1},
         1e-4,
         x_dev.GetDeviceBuffer(),
...
add_example_executable(example_grouped_convnd_fwd_bias_relu_xdl_fp16 grouped_convnd_fwd_bias_relu_xdl_fp16.cpp)
target_link_libraries(example_grouped_convnd_fwd_bias_relu_xdl_fp16 PRIVATE utility)
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
#arg3: time kernel (0=no, 1=yes)
#Following arguments (depending on number of spatial dims):
# N spatial dimensions
# G, N, K, C,
# <filter spatial dimensions>, (ie Y, X for 2D)
# <input image spatial dimensions>, (ie Hi, Wi for 2D)
# <strides>, (ie Sy, Sx for 2D)
# <dilations>, (ie Dy, Dx for 2D)
# <left padding>, (ie LeftPy, LeftPx for 2D)
# <right padding>, (ie RightPy, RightPx for 2D)
bin/example_grouped_convnd_fwd_bias_relu_xdl_fp16 1 1 1
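# Hypothetical fully-specified 2D run matching the example's built-in default
# problem (G=2, N=128, K=256, C=192, 3x3 filter, 71x71 input, stride 2, pad 1),
# with the argument order taken from the helper text above:
bin/example_grouped_convnd_fwd_bias_relu_xdl_fp16 1 1 1 2 2 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1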
```
Result (MI100)
```
in: dim 5, lengths {1, 128, 192, 71, 71}, strides {6912, 967872, 1, 13632, 192}
wei: dim 5, lengths {1, 256, 192, 3, 3}, strides {192, 1728, 1, 576, 192}
bias: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
out: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 331776, 1, 9216, 256}
launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 1.19215 ms, 123.112 TFlops, 279.827 GB/s, DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<256, 128, 256, 32, Default>
```
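As a sanity check on the printout: the forward conv does 2·G·N·K·Ho·Wo·C·Y·X = 2·1·128·256·36·36·192·3·3 ≈ 1.47e11 FLOPs; at 1.19215 ms that is ≈123.1 TFlops, matching the reported figure.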
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
void print_helper_msg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< "arg3: time kernel (0=no, 1=yes)\n"
<< ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
}
template <ck::index_t NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InElementOp,
typename WeiElementOp,
typename OutElementOp,
typename DeviceConvNDFwdInstance>
int run_grouped_conv_fwd_bias(bool do_verification,
int init_method,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param,
const HostTensorDescriptor& in_g_n_c_wis_desc,
const HostTensorDescriptor& wei_g_k_c_xs_desc,
const HostTensorDescriptor& bias_g_n_k_wos_desc,
const HostTensorDescriptor& out_g_n_k_wos_desc,
const InElementOp& in_element_op,
const WeiElementOp& wei_element_op,
const OutElementOp& out_element_op)
{
Tensor<InDataType> in(in_g_n_c_wis_desc);
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
Tensor<OutDataType> bias(bias_g_n_k_wos_desc);
Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
std::cout << "in: " << in.mDesc << std::endl;
std::cout << "wei: " << wei.mDesc << std::endl;
std::cout << "bias: " << bias.mDesc << std::endl;
std::cout << "out: " << out_host.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
bias.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
break;
default:
in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
bias.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
}
DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
DeviceMem bias_device_buf(sizeof(OutDataType) * bias.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
in_device_buf.ToDevice(in.mData.data());
wei_device_buf.ToDevice(wei.mData.data());
bias_device_buf.ToDevice(bias.mData.data());
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
std::array<ck::index_t, NDimSpatial + 3> d_g_n_k_wos_lengths{};
std::array<ck::index_t, NDimSpatial + 3> d_g_n_k_wos_strides{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{};
auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
copy(bias_g_n_k_wos_desc.GetLengths(), d_g_n_k_wos_lengths);
copy(bias_g_n_k_wos_desc.GetStrides(), d_g_n_k_wos_strides);
copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
copy(conv_param.conv_filter_strides_, conv_filter_strides);
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
copy(conv_param.input_left_pads_, input_left_pads);
copy(conv_param.input_right_pads_, input_right_pads);
// do Conv
auto conv = DeviceConvNDFwdInstance{};
auto invoker = conv.MakeInvoker();
auto argument = conv.MakeArgument(
in_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
std::array<const void*, 1>{bias_device_buf.GetDeviceBuffer()},
out_device_buf.GetDeviceBuffer(),
a_g_n_c_wis_lengths,
a_g_n_c_wis_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{{d_g_n_k_wos_lengths}},
std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{{d_g_n_k_wos_strides}},
e_g_n_k_wos_lengths,
e_g_n_k_wos_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
if(!conv.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_conv with the specified compilation parameters does "
"not support this Conv problem");
}
float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = conv_param.GetFlops();
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
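    // Unit bookkeeping: flop / 1e9 = GFlop, and GFlop per millisecond = TFlop per second;
    // likewise bytes / 1e6 = MB, and MB per millisecond = GB per second.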
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< conv.GetTypeString() << std::endl;
if(do_verification)
{
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
Tensor<OutDataType> c_host(out_g_n_k_wos_desc);
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
PassThrough>();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in,
wei,
c_host,
conv_param.conv_filter_strides_,
conv_param.conv_filter_dilations_,
conv_param.input_left_pads_,
conv_param.input_right_pads_,
in_element_op,
wei_element_op,
PassThrough{});
ref_invoker.Run(ref_argument);
// TODO: implement elementwise operation for host
out_host.ForEach(
[&](auto&, auto idx) { out_element_op(out_host(idx), c_host(idx), bias(idx)); });
out_device_buf.FromDevice(out_device.mData.data());
return ck::utils::check_err(
out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f)
? 0
: 1;
}
return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "grouped_convnd_fwd_bias_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using AccDataType = float;
using CShuffleDataType = ck::half_t;
using BiasDataType = ck::half_t;
using OutDataType = ck::half_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
#if 1
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename BiasLayout,
typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<BiasLayout>,
OutLayout,
InDataType,
WeiDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<BiasDataType>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, //
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
        1,              // CShuffleMXdlPerWavePerShuffle
        1,              // CShuffleNXdlPerWavePerShuffle
        S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
        8>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
#else
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename BiasLayout,
typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<BiasLayout>,
OutLayout,
InDataType,
WeiDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<BiasDataType>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, //
256, // BlockSize
256, // MPerBlock
16, // NPerBlock
32, // KPerBlock
8, // AK1
8, // BK1
16, // MPerXdl
16, // NPerXdl
4, // MXdlPerWave
1, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 16, 4>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
2, // BBlockTransferSrcScalarPerVector
2, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
4, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
        S<1, 256, 1, 1>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
        1>;              // CShuffleBlockTransferScalarPerVector_NPerBlock
#endif
int main(int argc, char* argv[])
{
namespace ctc = ck::tensor_layout::convolution;
print_helper_msg();
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
// conventional group conv definition
// G = 2
// [N, C, Hi, Wi] = [128, 384, 71, 71]
// [K, C, Y, X] = [512, 192, 3, 3]
// [N, K, Ho, Wo] = [128, 512, 36, 36]
// CK group conv definition
// [G, N, C, Hi, Wi] = [2, 128, 192, 71, 71]
// [G, K, C, Y, X] = [2, 256, 192, 3, 3]
// [G, N, K, Ho, Wo] = [2, 128, 256, 36, 36]
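    // i.e. channels are split per group: C = 384/2 = 192, K = 512/2 = 256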
ck::utils::conv::ConvParam conv_param{
2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc == 1)
{
// use default
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
}
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(conv_param.num_dim_spatial_ == 1)
{
using InLayout = ctc::G_NW_C;
using WeiLayout = ctc::G_K_X_C;
using BiasLayout = ctc::G_NW_K;
using OutLayout = ctc::G_NW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k
1, // c
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
0, // k
1, // c
0 // x
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias<
1,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<1, InLayout, WeiLayout, BiasLayout, OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 2)
{
using InLayout = ctc::G_NHW_C;
using WeiLayout = ctc::G_K_YX_C;
using BiasLayout = ctc::G_NHW_K;
using OutLayout = ctc::G_NHW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.C_,
conv_param.input_spatial_lengths_[0],
conv_param.input_spatial_lengths_[1]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.K_,
conv_param.C_,
conv_param.filter_spatial_lengths_[0],
conv_param.filter_spatial_lengths_[1]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
0, // n
1, // k
0, // ho
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias<
2,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<2, InLayout, WeiLayout, BiasLayout, OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 3)
{
using InLayout = ctc::G_NDHW_C;
using WeiLayout = ctc::G_K_ZYX_C;
using BiasLayout = ctc::G_NDHW_K;
using OutLayout = ctc::G_NDHW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.C_,
conv_param.input_spatial_lengths_[0],
conv_param.input_spatial_lengths_[1],
conv_param.input_spatial_lengths_[2]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
conv_param.G_ * conv_param.C_, // di
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.K_,
conv_param.C_,
conv_param.filter_spatial_lengths_[0],
conv_param.filter_spatial_lengths_[1],
conv_param.filter_spatial_lengths_[2]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
conv_param.C_, // z
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
0, // n
1, // k
0, // z
0, // y
0 // x
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
conv_param.G_ * conv_param.K_, // do
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias<
3,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<3, InLayout, WeiLayout, BiasLayout, OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
return 0;
}
add_example_executable(example_batched_gemm_gemm_xdl_fp16 batched_gemm_gemm_xdl_fp16.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
/*
Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o
                                              |------------|
                                                  Gemm0
                                              |---------------------|
                                                       Gemm1
*/
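// Equivalent unfused computation, as an illustrative sketch (not part of this file):
//   for b in [0, BatchCount):
//       Acc[b] = A[b] * B0[b];   // Gemm0: [M,K] x [K,N] -> [M,N]
//       C[b]   = Acc[b] * B1[b]; // Gemm1: [M,N] x [N,O] -> [M,O]
// The fused device kernel keeps the [M,N] intermediate on-chip instead of
// round-tripping it through global memory between the two GEMMs.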
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ADataType = F16;
using B0DataType = F16;
using B1DataType = F16;
using AccDataType = F32;
using CShuffleDataType = F32;
using CDataType = F16;
using ALayout = Row;
using B0Layout = Col;
using B1Layout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using B0ElementOp = PassThrough;
using Acc0ElementOp = PassThrough;
using B1ElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle<
ALayout,
B0Layout,
B1Layout,
CLayout,
ADataType,
B0DataType,
B1DataType,
CDataType,
AccDataType,
CShuffleDataType,
AElementOp,
B0ElementOp,
Acc0ElementOp,
B1ElementOp,
CElementOp,
GemmDefault,
1,
256,
128, // MPerBlock
128, // NPerBlock
32, // KPerBlock
128, // Gemm1NPerBlock
32, // Gemm1KPerBlock
8, // AK1
8, // BK1
2, // B1K1
32, // MPerXDL
32, // NPerXDL
1, // MXdlPerWave
4, // NXdlPerWave
4, // Gemm1NXdlPerWave
S<4, 64, 1>, // ABlockTransfer
S<1, 0, 2>,
S<1, 0, 2>,
2,
8,
8,
true,
S<4, 64, 1>, // BBlockTransfer
S<1, 0, 2>,
S<1, 0, 2>,
2,
8,
8,
true,
S<8, 32, 1>, // B1BlockTransfer
S<0, 2, 1>,
S<0, 2, 1>,
1,
4,
2,
false,
1, // CShuffleMXdlPerWavePerShuffle
2, // CShuffleNXdlPerWavePerShuffle
S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
8>; // CShuffleBlockTransferScalarPerVector_NPerBlock
using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
B0DataType,
ADataType,
AccDataType,
AElementOp,
B0ElementOp,
CElementOp>;
using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
B1DataType,
CDataType,
AccDataType,
AElementOp,
B1ElementOp,
CElementOp>;
int main(int argc, char* argv[])
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
// GEMM shape
ck::index_t M = 1024;
ck::index_t N = 1024;
ck::index_t K = 64;
ck::index_t O = 128;
ck::index_t BatchCount = 4;
ck::index_t StrideA = -1;
ck::index_t StrideB0 = -1;
ck::index_t StrideB1 = -1;
ck::index_t StrideC = -1;
ck::index_t BatchStrideA = -1;
ck::index_t BatchStrideB0 = -1;
ck::index_t BatchStrideB1 = -1;
ck::index_t BatchStrideC = -1;
if(argc == 1)
{
// use default case
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 9)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
O = std::stoi(argv[7]);
BatchCount = std::stoi(argv[8]);
}
else if(argc == 17)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
O = std::stoi(argv[7]);
BatchCount = std::stoi(argv[8]);
StrideA = std::stoi(argv[9]);
StrideB0 = std::stoi(argv[10]);
StrideB1 = std::stoi(argv[11]);
StrideC = std::stoi(argv[12]);
BatchStrideA = std::stoi(argv[13]);
BatchStrideB0 = std::stoi(argv[14]);
BatchStrideB1 = std::stoi(argv[15]);
BatchStrideC = std::stoi(argv[16]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=no, 1=yes)\n");
printf("arg4 to 17: M, N, K, O, Batch, StrideA, StrideB0, StrideB1, StrideC, BatchStrideA, "
"BatchStrideB0, BatchStrideB1, BatchStrideC\n");
exit(0);
}
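    // Stride defaults below: the leading-dimension stride is the column count for a
    // row-major matrix (the row count for column-major), and the batch stride
    // defaults to one full matrix worth of elements.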
const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
const int DefaultStrideB0 = ck::is_same_v<B0Layout, Row> ? N : K;
const int DefaultStrideB1 = ck::is_same_v<B1Layout, Row> ? O : N;
const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? O : M;
StrideA = (StrideA < 0) ? DefaultStrideA : StrideA;
StrideB0 = (StrideB0 < 0) ? DefaultStrideB0 : StrideB0;
StrideB1 = (StrideB1 < 0) ? DefaultStrideB1 : StrideB1;
StrideC = (StrideC < 0) ? DefaultStrideC : StrideC;
const int DefaultBatchStrideA = (ck::is_same_v<ALayout, Col> ? K : M) * StrideA;
const int DefaultBatchStrideB0 = (ck::is_same_v<B0Layout, Col> ? N : K) * StrideB0;
const int DefaultBatchStrideB1 = (ck::is_same_v<B1Layout, Col> ? O : N) * StrideB1;
const int DefaultBatchStrideC = (ck::is_same_v<CLayout, Col> ? O : M) * StrideC;
BatchStrideA = BatchStrideA < 0 ? DefaultBatchStrideA : BatchStrideA;
BatchStrideB0 = BatchStrideB0 < 0 ? DefaultBatchStrideB0 : BatchStrideB0;
BatchStrideB1 = BatchStrideB1 < 0 ? DefaultBatchStrideB1 : BatchStrideB1;
BatchStrideC = BatchStrideC < 0 ? DefaultBatchStrideC : BatchStrideC;
auto f_host_tensor_descriptor = [](std::size_t batch_count,
std::size_t row,
std::size_t col,
std::size_t stride,
std::size_t batch_stride,
auto layout) {
if(std::is_same<decltype(layout), Row>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
std::vector<std::size_t>({batch_stride, stride, 1}));
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
std::vector<std::size_t>({batch_stride, 1, stride}));
}
};
// C_m_o = A_m_k * B0_k_n * B1_n_o
Tensor<ADataType> a_g_m_k(
f_host_tensor_descriptor(BatchCount, M, K, StrideA, BatchStrideA, ALayout{}));
Tensor<B0DataType> b0_g_k_n(
f_host_tensor_descriptor(BatchCount, K, N, StrideB0, BatchStrideB0, B0Layout{}));
Tensor<B1DataType> b1_g_n_o(
f_host_tensor_descriptor(BatchCount, N, O, StrideB1, BatchStrideB1, B1Layout{}));
Tensor<CDataType> c_g_m_o_host_result(
f_host_tensor_descriptor(BatchCount, M, O, StrideC, BatchStrideC, CLayout{}));
Tensor<CDataType> c_g_m_o_device_result(
f_host_tensor_descriptor(BatchCount, M, O, StrideC, BatchStrideC, CLayout{}));
std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
std::cout << "b0_g_k_n: " << b0_g_k_n.mDesc << std::endl;
std::cout << "b1_g_n_o: " << b1_g_n_o.mDesc << std::endl;
std::cout << "c_g_m_o: " << c_g_m_o_host_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b0_g_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
b1_g_n_o.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-5, 5});
break;
case 2:
a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b0_g_k_n.GenerateTensorValue(GeneratorTensor_3<B0DataType>{0.0, 1.0});
b1_g_n_o.GenerateTensorValue(GeneratorTensor_3<B1DataType>{-0.5, 0.5});
break;
default:
a_g_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
}
DeviceMem a_g_m_k_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSize());
DeviceMem b0_g_k_n_device_buf(sizeof(B0DataType) * b0_g_k_n.mDesc.GetElementSize());
DeviceMem b1_g_n_o_device_buf(sizeof(B1DataType) * b1_g_n_o.mDesc.GetElementSize());
DeviceMem c_g_m_o_device_buf(sizeof(CDataType) * c_g_m_o_device_result.mDesc.GetElementSize());
a_g_m_k_device_buf.ToDevice(a_g_m_k.mData.data());
b0_g_k_n_device_buf.ToDevice(b0_g_k_n.mData.data());
b1_g_n_o_device_buf.ToDevice(b1_g_n_o.mData.data());
auto a_element_op = AElementOp{};
auto b0_element_op = B0ElementOp{};
auto acc0_element_op = Acc0ElementOp{};
auto b1_element_op = B1ElementOp{};
auto c_element_op = CElementOp{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument =
gemm.MakeArgument(static_cast<ADataType*>(a_g_m_k_device_buf.GetDeviceBuffer()),
static_cast<B0DataType*>(b0_g_k_n_device_buf.GetDeviceBuffer()),
static_cast<B1DataType*>(b1_g_n_o_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_g_m_o_device_buf.GetDeviceBuffer()),
M,
N,
K,
O,
BatchCount,
StrideA,
StrideB0,
StrideB1,
StrideC,
BatchStrideA,
BatchStrideB0,
BatchStrideB1,
BatchStrideC,
a_element_op,
b0_element_op,
acc0_element_op,
b1_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl;
return 0;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
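    // Gemm0 contributes 2*M*N*K flops and Gemm1 contributes 2*M*N*O flops, per batch.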
std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount;
std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N +
sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) *
BatchCount;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
c_g_m_o_device_buf.FromDevice(c_g_m_o_device_result.mData.data());
if(do_verification)
{
// Output of Gemm0 is input A of Gemm1
Tensor<ADataType> a1_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{}));
auto ref_gemm0 = ReferenceGemm0Instance{};
auto ref_gemm0_invoker = ref_gemm0.MakeInvoker();
auto ref_gemm0_argument = ref_gemm0.MakeArgument(
a_g_m_k, b0_g_k_n, a1_g_m_n, a_element_op, b0_element_op, PassThrough{});
ref_gemm0_invoker.Run(ref_gemm0_argument);
auto ref_gemm1 = ReferenceGemm1Instance{};
auto ref_gemm1_invoker = ref_gemm1.MakeInvoker();
auto ref_gemm1_argument = ref_gemm1.MakeArgument(
a1_g_m_n, b1_g_n_o, c_g_m_o_host_result, PassThrough{}, b1_element_op, c_element_op);
ref_gemm1_invoker.Run(ref_gemm1_argument);
return ck::utils::check_err(c_g_m_o_device_result.mData, c_g_m_o_host_result.mData) ? 0 : 1;
}
return 0;
}
@@ -44,6 +44,6 @@ add_subdirectory(26_contraction)
 add_subdirectory(27_layernorm)
 add_subdirectory(28_grouped_gemm_bias_e_permute)
 add_subdirectory(29_batched_gemm_bias_e_permute)
-add_subdirectory(30_grouped_convnd_fwd_bias_relu)
-add_subdirectory(31_grouped_convnd_fwd_bias_relu_add)
-add_subdirectory(32_batched_gemm_gemm)
+add_subdirectory(30_grouped_convnd_fwd_bias_relu_add)
+add_subdirectory(31_batched_gemm_gemm)
+add_subdirectory(32_batched_gemm_softmax_gemm)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/utility/reduction_common.hpp"
namespace ck {
// clang-format off
// Assumptions:
// 1) work_buffer is a buffer (typically LDS) allocated outside as workspace
// 2) work_buffer holds elements of type T, and its space is no less than 3*BlockSize
// 3) mean_value, var_value and count are the per-thread input data in VGPRs
// 4) mean_value, var_value and count are overwritten with the reduced output for each thread
// 5) Merges the mean and M2 (sum of squared deviations) produced by ThreadwiseWelford
// clang-format on
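// Merge() below implements the standard parallel Welford/Chan update rule
// (var_a/var_b hold the running M2, not the variance, until the final
// division in Run()):
//   count = count_a + count_b
//   delta = mean_b - mean_a
//   mean  = mean_a + delta * count_b / count
//   M2    = M2_a + M2_b + delta^2 * count_a * count_b / count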
template <typename T,
index_t BlockSize,
typename ThreadClusterLengths_M_K,
typename ThreadClusterArrangeOrder,
bool GetActualVariance = true>
struct BlockwiseWelford
{
static_assert(BlockSize == ThreadClusterLengths_M_K::At(0) * ThreadClusterLengths_M_K::At(1),
"The product of cluster lengths should be same as BlockSize!");
static constexpr auto BufferLength_M = ThreadClusterLengths_M_K::At(0);
static constexpr auto BufferLength_K = ThreadClusterLengths_M_K::At(1);
static constexpr auto block_buf_desc_m_k = make_naive_tensor_descriptor_packed(
make_tuple(Number<BufferLength_M>{}, Number<BufferLength_K>{}));
static constexpr auto thread_cluster_desc =
make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
__device__ static inline void
Merge(T& mean_a, T& var_a, int& count_a, T mean_b, T var_b, int count_b)
{
int count = count_a + count_b;
T count_b_over_count = count == 0 ? type_convert<T>(0) : type_convert<T>(count_b) / count;
T delta = mean_b - mean_a;
mean_a += delta * count_b_over_count;
var_a += var_b + delta * delta * count_a * count_b_over_count;
count_a = count;
}
__device__ static void Run(T& mean_value, T& var_value, int& count)
{
__shared__ T mean_block_buf[BlockSize];
__shared__ T var_block_buf[BlockSize];
__shared__ int count_block_buf[BlockSize];
constexpr auto cluster_len_shift = get_shift<BufferLength_K>();
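        // cluster_len_shift is the log2 of BufferLength_K, so the static_for below
        // performs a binary-tree merge in LDS, halving the active K-lanes each step.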
const auto thread_cluster_idx =
thread_cluster_desc.CalculateBottomIndex(make_multi_index(get_thread_local_1d_id()));
const auto thread_m_cluster_id = thread_cluster_idx[Number<0>{}];
const auto thread_k_cluster_id = thread_cluster_idx[Number<1>{}];
index_t offset1 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx);
mean_block_buf[offset1] = mean_value;
var_block_buf[offset1] = var_value;
count_block_buf[offset1] = count;
block_sync_lds();
static_for<0, cluster_len_shift, 1>{}([&](auto I) {
constexpr index_t indOffset = 1 << (cluster_len_shift - 1 - I());
if(thread_k_cluster_id < indOffset)
{
index_t offset2 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx +
make_tuple(0, indOffset));
T mean1 = mean_block_buf[offset1];
T var1 = var_block_buf[offset1];
int count1 = count_block_buf[offset1];
T mean2 = mean_block_buf[offset2];
T var2 = var_block_buf[offset2];
int count2 = count_block_buf[offset2];
Merge(mean1, var1, count1, mean2, var2, count2);
mean_block_buf[offset1] = mean1;
var_block_buf[offset1] = var1;
count_block_buf[offset1] = count1;
}
block_sync_lds();
});
index_t offset = block_buf_desc_m_k.CalculateOffset(make_tuple(thread_m_cluster_id, 0));
count = count_block_buf[offset];
mean_value = mean_block_buf[offset];
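        // var_block_buf holds M2 (sum of squared deviations); dividing by the
        // merged count yields the actual (biased) variance.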
if constexpr(GetActualVariance)
var_value = var_block_buf[offset] / count;
else
var_value = var_block_buf[offset];
};
};
} // namespace ck
@@ -9,13 +9,48 @@
 #include "ck/utility/reduction_operator.hpp"
 #include "ck/tensor_operation/gpu/device/device_normalization.hpp"
 #include "ck/tensor_operation/gpu/device/device_reduce.hpp"
-#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
 #include "ck/tensor_operation/gpu/device/device_reduce_common.hpp"
-#include "ck/tensor_operation/gpu/grid/gridwise_layernorm.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_layernorm_welford_variance.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp"
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
 
+namespace ck {
+
+template <typename GridwiseReduction,
+          typename XDataType,
+          typename GammaDataType,
+          typename BetaDataType,
+          typename YDataType,
+          typename AccDataType,
+          typename AccElementwiseOperation,
+          typename GridDesc_M_K,
+          typename GridDesc_K>
+__global__ void kernel_layernorm(const GridDesc_M_K x_grid_desc_m_k,
+                                 const GridDesc_K gamma_grid_desc_k,
+                                 const GridDesc_K beta_grid_desc_k,
+                                 const GridDesc_M_K y_grid_desc_m_k,
+                                 index_t num_k_block_tile_iteration,
+                                 AccDataType epsilon,
+                                 const XDataType* const __restrict__ p_x_global,
+                                 const GammaDataType* const __restrict__ p_gamma_global,
+                                 const BetaDataType* const __restrict__ p_beta_global,
+                                 YDataType* const __restrict__ p_y_global,
+                                 const AccElementwiseOperation acc_elementwise_op)
+{
+    GridwiseReduction::Run(x_grid_desc_m_k,
+                           gamma_grid_desc_k,
+                           beta_grid_desc_k,
+                           y_grid_desc_m_k,
+                           num_k_block_tile_iteration,
+                           epsilon,
+                           p_x_global,
+                           p_gamma_global,
+                           p_beta_global,
+                           p_y_global,
+                           acc_elementwise_op);
+};
+} // namespace ck
+
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -39,7 +74,7 @@ template <typename XDataType,
           index_t GammaSrcVectorSize,
           index_t BetaSrcVectorSize,
           index_t YDstVectorSize>
-struct DeviceLayernorm : public DeviceNormalization2<XDataType,
+struct DeviceLayernormImpl : public DeviceLayernorm<XDataType,
                                                      GammaDataType,
                                                      BetaDataType,
                                                      AccDataType,
@@ -58,27 +93,74 @@ struct DeviceLayernorm : public DeviceNormalization2<XDataType,
     using PassThrough = tensor_operation::element_wise::PassThrough;
 
-    // Used for freeloading of some handy functions from DeviceReduceMultiBlock
-    using Reduction = DeviceReduceMultiBlock<XDataType,
-                                             AccDataType,
-                                             YDataType,
-                                             Rank,
-                                             NumReduceDim,
-                                             reduce::Add,
-                                             PassThrough,             // InElementwiseOperation
-                                             AccElementwiseOperation, // AccElementwiseOperation
-                                             InMemoryDataOperationEnum::Set,
-                                             false, // PropagateNan
-                                             false, // OutputIndex
-                                             false, // HaveIndexInputIfOutputIndex
-                                             BlockSize,
-                                             MThreadClusterSize,
-                                             KThreadClusterSize,
-                                             MThreadSliceSize,
-                                             KThreadSliceSize,
-                                             XYSrcVectorDim,
-                                             XSrcVectorSize,
-                                             1>; // YDstVectorSize
+    static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
+    static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
+
+    static auto MakeSrc2dDescriptor(const std::vector<index_t>& inLengths,
+                                    const std::vector<index_t>& inStrides,
+                                    int blkGroupSize,
+                                    int numBlockTileIteration)
+    {
+        constexpr index_t NumInvariantDim  = Rank - NumReduceDim;
+        static constexpr index_t numSrcDim = Rank;
+        static constexpr bool reduceAllDim = (NumInvariantDim == 0);
+
+        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<numSrcDim>{});
+        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<numSrcDim>{});
+
+        const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
+
+        const auto in_grid_desc_m_k = [&]() {
+            if constexpr(reduceAllDim)
+            {
+                const auto one_dim_inDesc = transform_tensor_descriptor(
+                    inDesc,
+                    make_tuple(make_merge_transform(tupleSrcLengths)),
+                    make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}),
+                    make_tuple(Sequence<0>{}));
+
+                return transform_tensor_descriptor(one_dim_inDesc,
+                                                   make_tuple(make_unmerge_transform(make_tuple(
+                                                       1, one_dim_inDesc.GetLength(Number<0>{})))),
+                                                   make_tuple(Sequence<0>{}),
+                                                   make_tuple(Sequence<0, 1>{}));
+            }
+            else
+            {
+                using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
+                using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
+
+                const auto reduceDimLengths =
+                    make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
+                const auto invariantDimLengths =
+                    make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});
+
+                return transform_tensor_descriptor(
+                    inDesc,
+                    make_tuple(make_merge_transform(invariantDimLengths),
+                               make_merge_transform(reduceDimLengths)),
+                    make_tuple(InvariantDims{}, ReduceDims{}),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}));
+            }
+        }();
+
+        const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
+        const auto reduceLength    = in_grid_desc_m_k.GetLength(Number<1>{});
+
+        const int reduceSizePerBlock = K_BlockTileSize * numBlockTileIteration;
+        const auto inPad_M =
+            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
+        const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength;
+
+        auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
+            in_grid_desc_m_k,
+            make_tuple(make_right_pad_transform(invariantLength, inPad_M),
+                       make_right_pad_transform(reduceLength, inPad_K)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        return (in_grid_desc_m_k_padded);
+    };
 
     static auto MakeAffine1dDescriptor(const std::vector<index_t>& Lengths,
                                        const std::vector<index_t>& Strides,
@@ -97,7 +179,7 @@ struct DeviceLayernorm : public DeviceNormalization2<XDataType,
             make_tuple(Sequence<0>{}));
 
         const auto reduceTotalLength = grid_desc_k.GetLength(Number<0>{});
-        const int reduceSizePerBlock = Reduction::K_BlockTileSize * numBlockTileIteration;
+        const int reduceSizePerBlock = K_BlockTileSize * numBlockTileIteration;
         const auto Pad_K = reduceSizePerBlock * blkGroupSize - reduceTotalLength;
@@ -110,10 +192,11 @@ struct DeviceLayernorm : public DeviceNormalization2<XDataType,
         return (grid_desc_k_padded);
     };
 
-    using GridDesc_M_K = decltype(Reduction::MakeSrc2dDescriptor({1}, {1}, 1, 1));
+    using GridDesc_M_K = decltype(MakeSrc2dDescriptor({1}, {1}, 1, 1));
     using GridDesc_K   = decltype(MakeAffine1dDescriptor({1}, {1}, 1, 1));
 
-    using GridwiseReduceLayernormGeneric = GridwiseLayernorm_mk_to_mk<XDataType,
+    using GridwiseReduceLayernormGeneric =
+        GridwiseLayernormWelfordVariance_mk_to_mk<XDataType,
                                                   GammaDataType,
                                                   BetaDataType,
                                                   YDataType,
@@ -134,7 +217,8 @@ struct DeviceLayernorm : public DeviceNormalization2<XDataType,
                                                   YDstVectorSize,
                                                   false>;
 
-    using GridwiseReduceLayernormSweepOnce = GridwiseLayernorm_mk_to_mk<XDataType,
+    using GridwiseReduceLayernormSweepOnce =
+        GridwiseLayernormWelfordVariance_mk_to_mk<XDataType,
                                                   GammaDataType,
                                                   BetaDataType,
                                                   YDataType,
@@ -155,12 +239,13 @@ struct DeviceLayernorm : public DeviceNormalization2<XDataType,
                                                   YDstVectorSize,
                                                   true>;
 
-    struct Argument : public Reduction::Argument
+    struct Argument : public BaseArgument
     {
         Argument(const std::vector<index_t> lengths,
                  const std::vector<index_t> xStrides,
                  const std::vector<index_t> gammaStrides,
                  const std::vector<index_t> betaStrides,
+                 const std::vector<index_t> yStrides,
                  const std::vector<index_t> reduceDims,
                  AccElementwiseOperation acc_elementwise_op,
                  AccDataType epsilon,
@@ -168,53 +253,76 @@ struct DeviceLayernorm : public DeviceNormalization2<XDataType,
                  const GammaDataType* p_gamma,
                  const BetaDataType* p_beta,
                  YDataType* p_y)
-            : Reduction::Argument(lengths,
-                                  xStrides,
-                                  {},
-                                  {},
-                                  reduceDims,
-                                  0.0f, // alpha
-                                  0.0f, // beta
-                                  p_x,
-                                  nullptr,
-                                  p_y,
-                                  nullptr,
-                                  acc_elementwise_op,
-                                  PassThrough{}),
-              epsilon_(epsilon),
+            : epsilon_(epsilon),
+              p_x_(p_x),
               p_gamma_(p_gamma),
               p_beta_(p_beta),
+              p_y_(p_y),
               gammaStrides_(gammaStrides),
-              betaStrides_(betaStrides)
+              betaStrides_(betaStrides),
+              acc_elementwise_op_(acc_elementwise_op)
         {
-            reduceLength_.resize(NumReduceDim);
+            Lengths_  = shuffle_tensor_dimensions<Rank, NumReduceDim>(lengths, reduceDims);
+            xStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(xStrides, reduceDims);
+            yStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(yStrides, reduceDims);
+
+            long_index_t invariant_total_length;
+            long_index_t reduce_total_length;
+
+            std::tie(invariant_total_length, reduce_total_length) =
+                get_2d_lengths<Rank, NumReduceDim>(Lengths_);
+
+            blkGroupSize_          = 1;
+            numBlockTileIteration_ = (reduce_total_length + K_BlockTileSize - 1) / K_BlockTileSize;
+
+            gridSize_ = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
+                        M_BlockTileSize * blkGroupSize_;
+
+            reduceLengths_.resize(NumReduceDim);
 
             for(int i = 0; i < NumReduceDim; ++i)
             {
-                reduceLength_[i] = lengths[reduceDims[i]];
+                reduceLengths_[i] = lengths[reduceDims[i]];
             }
         }
 
         AccDataType epsilon_;
+        const XDataType* p_x_;
         const GammaDataType* p_gamma_;
         const BetaDataType* p_beta_;
-        std::vector<index_t> reduceLength_;
+        YDataType* p_y_;
+        std::vector<index_t> Lengths_;
+        std::vector<index_t> xStrides_;
+        std::vector<index_t> reduceLengths_;
         std::vector<index_t> gammaStrides_;
         std::vector<index_t> betaStrides_;
+        std::vector<index_t> yStrides_;
+        AccElementwiseOperation acc_elementwise_op_;
+        int blkGroupSize_;
+        int numBlockTileIteration_;
+        size_t gridSize_;
     };
 
     struct Invoker : public BaseInvoker
     {
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
-            const auto x_grid_desc_m_k = Reduction::MakeSrc2dDescriptor(
-                arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration);
-            const auto gamma_grid_desc_k = MakeAffine1dDescriptor(
-                arg.reduceLength_, arg.gammaStrides_, arg.blkGroupSize, arg.numBlockTileIteration);
-            const auto beta_grid_desc_k = MakeAffine1dDescriptor(
-                arg.reduceLength_, arg.betaStrides_, arg.blkGroupSize, arg.numBlockTileIteration);
-            const auto y_grid_desc_m_k = Reduction::MakeSrc2dDescriptor(
-                arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration);
+            const auto x_grid_desc_m_k = MakeSrc2dDescriptor(
+                arg.Lengths_, arg.xStrides_, arg.blkGroupSize_, arg.numBlockTileIteration_);
+            const auto gamma_grid_desc_k = MakeAffine1dDescriptor(arg.reduceLengths_,
+                                                                  arg.gammaStrides_,
+                                                                  arg.blkGroupSize_,
+                                                                  arg.numBlockTileIteration_);
+            const auto beta_grid_desc_k  = MakeAffine1dDescriptor(arg.reduceLengths_,
+                                                                  arg.betaStrides_,
+                                                                  arg.blkGroupSize_,
+                                                                  arg.numBlockTileIteration_);
+            const auto y_grid_desc_m_k = MakeSrc2dDescriptor(
+                arg.Lengths_, arg.yStrides_, arg.blkGroupSize_, arg.numBlockTileIteration_);
 
             bool sweep_once =
                 x_grid_desc_m_k.GetLength(Number<1>{}) <= KThreadClusterSize * KThreadSliceSize;
@@ -241,19 +349,19 @@ struct DeviceLayernorm : public DeviceNormalization2<XDataType,
             float avg_time = 0;
 
             avg_time += launch_and_time_kernel(stream_config,
                                                kernel_main,
-                                               dim3(arg.gridSize),
+                                               dim3(arg.gridSize_),
                                                dim3(BlockSize),
                                                0,
                                                x_grid_desc_m_k,
                                                gamma_grid_desc_k,
                                                beta_grid_desc_k,
                                                y_grid_desc_m_k,
-                                               arg.numBlockTileIteration,
+                                               arg.numBlockTileIteration_,
                                                arg.epsilon_,
-                                               arg.in_dev_,
+                                               arg.p_x_,
                                                arg.p_gamma_,
                                                arg.p_beta_,
-                                               arg.out_dev_,
+                                               arg.p_y_,
                                                arg.acc_elementwise_op_);
 
             return (avg_time);
@@ -270,12 +378,33 @@ struct DeviceLayernorm : public DeviceNormalization2<XDataType,
     {
         const Argument* p_arg_ = dynamic_cast<const Argument*>(p_arg);
 
-        if(!Reduction::IsSupportedArgument(p_arg_))
-        {
-            return false;
-        }
+        constexpr index_t NumInvariantDim = Rank - NumReduceDim;
+
+        if constexpr(XYSrcVectorDim == 0)
+        {
+            if constexpr(NumInvariantDim == 0)
+            {
+                return false;
+            }
+            else
+            {
+                if(p_arg_->xStrides_[NumInvariantDim - 1] != 1)
+                    return false;
+
+                if(p_arg_->invariant_lowest_length % XSrcVectorSize != 0)
+                    return false;
+            };
+        }
+        else
+        {
+            if(p_arg_->xStrides_[Rank - 1] != 1)
+                return false;
+
+            if(p_arg_->Lengths_[Rank - 1] % XSrcVectorSize != 0)
+                return false;
+        };
 
-        if(p_arg_->inLengths_[Rank - 1] % YDstVectorSize != 0)
+        if(p_arg_->Lengths_[Rank - 1] % YDstVectorSize != 0)
         {
             return false;
         }
@@ -309,6 +438,7 @@ struct DeviceLayernorm : public DeviceNormalization2<XDataType,
                  const std::vector<index_t> xStrides,
                  const std::vector<index_t> gammaStrides,
                  const std::vector<index_t> betaStrides,
+                 const std::vector<index_t> yStrides,
                  const std::vector<index_t> reduceDims,
                  AccDataType epsilon,
                  const void* p_x,
@@ -321,6 +451,7 @@ struct DeviceLayernorm : public DeviceNormalization2<XDataType,
                                             xStrides,
                                             gammaStrides,
                                             betaStrides,
+                                            yStrides,
                                             reduceDims,
                                             acc_elementwise_op,
                                             epsilon,
@@ -340,7 +471,7 @@ struct DeviceLayernorm : public DeviceNormalization2<XDataType,
         auto str = std::stringstream();
         // clang-format off
-        str << "DeviceLayernorm<" << BlockSize << ",";
+        str << "DeviceLayernormImpl<" << BlockSize << ",";
         str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
         str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
         str << "XYSrcVectorDim_" << XYSrcVectorDim << ",";
...