Merge branch 'develop' into gemm_layernorm_welford

5aa3c344 · rocking5566 · GitHub · 7fefc966 · 9d8f834a · 5aa3c344
Unverified Commit 5aa3c344 authored Oct 05, 2022 by rocking5566 Committed by GitHub Oct 05, 2022
20 changed files
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
@@ -58,7 +58,7 @@ using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
 using B1ElementOp   = PassThrough;
 using CElementOp    = PassThrough;

-static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNOPadding;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;

 using DeviceGemmInstance =
    ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<
@@ -117,7 +117,8 @@ using DeviceGemmInstance =
        1,              // CShuffleMXdlPerWavePerShuffle
        2,              // CShuffleNXdlPerWavePerShuffle
        S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-        8>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
+        8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+        false>;         // MaskOutUpperTriangle

 // Ref Gemm0: fp16 in, fp32 out
 using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
@@ -149,8 +150,8 @@ int main(int argc, char* argv[])

    // GEMM shape for A/B0/B1/C
    // C_g_m_o = A_g_m_k * B0_g_k_n * B1_g_n_o
-    ck::index_t M             = 128;
-    ck::index_t N             = 1024;
+    ck::index_t M             = 120;
+    ck::index_t N             = 1000;
    ck::index_t K             = 64;
    ck::index_t O             = 128;
    ck::index_t StrideA       = -1;

--- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp
@@ -55,7 +55,7 @@ using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
 using B1ElementOp   = PassThrough;
 using CElementOp    = PassThrough;

-static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;

 using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle<
    ALayout,
@@ -73,7 +73,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmSoftma
    Acc0ElementOp,
    B1ElementOp,
    CElementOp,
-    GemmDefault,
+    GemmSpec,
    1,
    256,
    128,         // MPerBlock
@@ -113,7 +113,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmSoftma
    1,              // CShuffleMXdlPerWavePerShuffle
    2,              // CShuffleNXdlPerWavePerShuffle
    S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-    8>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
+    8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+    false>;

 // Ref Gemm0: fp16 in, fp32 out
 using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
@@ -144,8 +145,8 @@ int main(int argc, char* argv[])
    bool time_kernel     = false;

    // GEMM shape
-    ck::index_t M             = 1024;
-    ck::index_t N             = 1024;
+    ck::index_t M             = 1020;
+    ck::index_t N             = 1020;
    ck::index_t K             = 64;
    ck::index_t O             = 128;
    ck::index_t BatchCount    = 4;

--- a/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+/*
+Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o
+                                                                  |-----------------|
+                                                                          Gemm0
+                                                          |-------------------------------------|
+                                                                          Gemm1
+*/
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = F16;
+using B0DataType       = F16;
+using B1DataType       = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using CDataType        = F16;
+
+using ALayout  = Row;
+using B0Layout = Col;
+using B1Layout = Row;
+
+using CPermuteNumDims_G_M_O =
+    S<1, 1, 1>; // "using CLayout = Row" has been replaced by CPermuteNumDims_M_O
+
+using AElementOp    = PassThrough;
+using B0ElementOp   = PassThrough;
+using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
+using B1ElementOp   = PassThrough;
+using CElementOp    = PassThrough;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+
+using DeviceGemmInstance =
+    ck::tensor_operation::device::DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle<
+        ALayout,
+        B0Layout,
+        B1Layout,
+        CPermuteNumDims_G_M_O,
+        ADataType,
+        B0DataType,
+        B1DataType,
+        CDataType,
+        AccDataType,
+        CShuffleDataType,
+        AElementOp,
+        B0ElementOp,
+        Acc0ElementOp,
+        B1ElementOp,
+        CElementOp,
+        GemmSpec,
+        1,
+        256,
+        128,         // MPerBlock
+        128,         // NPerBlock
+        32,          // KPerBlock
+        64,          // Gemm1NPerBlock
+        32,          // Gemm1KPerBlock
+        8,           // AK1
+        8,           // BK1
+        2,           // B1K1
+        32,          // MPerXDL
+        32,          // NPerXDL
+        1,           // MXdlPerWave
+        4,           // NXdlPerWave
+        2,           // Gemm1NXdlPerWave
+        S<4, 64, 1>, // ABlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        8,
+        8,
+        true,
+        S<4, 64, 1>, // BBlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        8,
+        8,
+        true,
+        S<16, 16, 1>, // B1BlockTransfer
+        S<0, 2, 1>,
+        S<0, 2, 1>,
+        1,
+        4,
+        2,
+        false,
+        1,              // CShuffleMXdlPerWavePerShuffle
+        2,              // CShuffleNXdlPerWavePerShuffle
+        S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+        8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+        false>;
+
+// Ref Gemm0: fp16 in, fp32 out
+using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
+                                                                                B0DataType,
+                                                                                AccDataType,
+                                                                                AccDataType,
+                                                                                AElementOp,
+                                                                                B0ElementOp,
+                                                                                Acc0ElementOp>;
+
+// Ref Softmax: fp32 in, fp16 out
+using ReferenceSoftmaxInstance =
+    ck::tensor_operation::host::ReferenceSoftmax<AccDataType, ADataType, AccDataType>;
+
+// Ref Gemm1: fp16 in, fp16 out
+using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
+                                                                                B1DataType,
+                                                                                CDataType,
+                                                                                AccDataType,
+                                                                                AElementOp,
+                                                                                B1ElementOp,
+                                                                                CElementOp>;
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
+    float alpha = 1; // scaling after 1st gemm
+
+    std::size_t group_count = 13;
+
+    // Problem descs
+    std::vector<DeviceGemmInstance::ProblemDesc> problem_descs;
+    std::vector<const void*> p_a;
+    std::vector<const void*> p_b0;
+    std::vector<const void*> p_b1;
+    std::vector<void*> p_c;
+
+    for(std::size_t i = 0; i < group_count; i++)
+    {
+        int M     = 128 * (rand() % 8 + 1);
+        int N     = 128 * (rand() % 8 + 1);
+        int K     = 40;
+        int O     = 40 * (rand() % 2 + 1);
+        int Batch = rand() % 8 + 1;
+
+        const int StrideA  = ck::is_same_v<ALayout, Row> ? K : M;
+        const int StrideB0 = ck::is_same_v<B0Layout, Row> ? N : K;
+        const int StrideB1 = ck::is_same_v<B1Layout, Row> ? O : N;
+
+        const int BatchStrideA  = (ck::is_same_v<ALayout, Col> ? K : M) * StrideA;
+        const int BatchStrideB0 = (ck::is_same_v<B0Layout, Col> ? N : K) * StrideB0;
+        const int BatchStrideB1 = (ck::is_same_v<B1Layout, Col> ? O : N) * StrideB1;
+
+        std::vector<ck::index_t> c_gs_ms_os_lengths{Batch, M, O};
+        std::vector<ck::index_t> c_gs_ms_os_strides{O, Batch * O, 1};
+
+        problem_descs.push_back({M,
+                                 N,
+                                 K,
+                                 O,
+                                 Batch,
+                                 StrideA,
+                                 StrideB0,
+                                 StrideB1,
+                                 BatchStrideA,
+                                 BatchStrideB0,
+                                 BatchStrideB1,
+                                 c_gs_ms_os_lengths,
+                                 c_gs_ms_os_strides});
+    }
+
+    auto f_host_tensor_descriptor = [](std::size_t batch_count,
+                                       std::size_t row,
+                                       std::size_t col,
+                                       std::size_t stride,
+                                       std::size_t batch_stride,
+                                       auto layout) {
+        if(std::is_same<decltype(layout), Row>::value)
+        {
+            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
+                                        std::vector<std::size_t>({batch_stride, stride, 1}));
+        }
+        else
+        {
+            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
+                                        std::vector<std::size_t>({batch_stride, 1, stride}));
+        }
+    };
+
+    std::vector<Tensor<ADataType>> a_tensors;
+    std::vector<Tensor<B0DataType>> b0_tensors;
+    std::vector<Tensor<B1DataType>> b1_tensors;
+    std::vector<Tensor<CDataType>> c_tensors;
+
+    using DeviceMemPtr = std::unique_ptr<DeviceMem>;
+
+    std::vector<DeviceMemPtr> a_tensors_device;
+    std::vector<DeviceMemPtr> b0_tensors_device;
+    std::vector<DeviceMemPtr> b1_tensors_device;
+    std::vector<DeviceMemPtr> c_tensors_device;
+
+    std::size_t flop = 0, num_byte = 0;
+
+    std::cout << "group count " << group_count << ". printing first 4 groups\n";
+    for(std::size_t i = 0; i < group_count; i++)
+    {
+        const auto& M                  = problem_descs[i].M;
+        const auto& N                  = problem_descs[i].N;
+        const auto& K                  = problem_descs[i].K;
+        const auto& O                  = problem_descs[i].O;
+        const auto& Batch              = problem_descs[i].Batch;
+        const auto& StrideA            = problem_descs[i].StrideA;
+        const auto& StrideB0           = problem_descs[i].StrideB0;
+        const auto& StrideB1           = problem_descs[i].StrideB1;
+        const auto& BatchStrideA       = problem_descs[i].BatchStrideA;
+        const auto& BatchStrideB0      = problem_descs[i].BatchStrideB0;
+        const auto& BatchStrideB1      = problem_descs[i].BatchStrideB1;
+        const auto& c_gs_ms_os_lengths = problem_descs[i].c_gs_ms_os_lengths;
+        const auto& c_gs_ms_os_strides = problem_descs[i].c_gs_ms_os_strides;
+
+        // C_m_o = A_m_k * B0_k_n * B1_n_o
+        Tensor<ADataType> a_g_m_k(
+            f_host_tensor_descriptor(Batch, M, K, StrideA, BatchStrideA, ALayout{}));
+        Tensor<B0DataType> b0_g_k_n(
+            f_host_tensor_descriptor(Batch, K, N, StrideB0, BatchStrideB0, B0Layout{}));
+        Tensor<B1DataType> b1_g_n_o(
+            f_host_tensor_descriptor(Batch, N, O, StrideB1, BatchStrideB1, B1Layout{}));
+        Tensor<CDataType> c_gs_ms_os_device_result(
+            std::vector<std::size_t>(c_gs_ms_os_lengths.begin(), c_gs_ms_os_lengths.end()),
+            std::vector<std::size_t>(c_gs_ms_os_strides.begin(), c_gs_ms_os_strides.end()));
+
+        flop += (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * Batch;
+        num_byte += (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N +
+                     sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) *
+                    Batch;
+
+        if(i < 4)
+        {
+            std::cout << "a_g_m_k[" << i << "]: " << a_g_m_k.mDesc << ", "
+                      << "b0_g_k_n[" << i << "]: " << b0_g_k_n.mDesc << ", "
+                      << "b1_g_n_o[" << i << "]: " << b1_g_n_o.mDesc << ", "
+                      << "c_gs_ms_os[" << i << "]: " << c_gs_ms_os_device_result.mDesc << std::endl;
+        }
+
+        switch(init_method)
+        {
+        case 0: break;
+        case 1:
+            a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+            b0_g_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
+            b1_g_n_o.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-2, 2});
+            break;
+        case 2:
+            a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+            b0_g_k_n.GenerateTensorValue(GeneratorTensor_3<B0DataType>{0.0, 1.0});
+            b1_g_n_o.GenerateTensorValue(GeneratorTensor_3<B1DataType>{-0.5, 0.5});
+            break;
+        case 3:
+            a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+            b0_g_k_n.GenerateTensorValue(GeneratorTensor_Diagonal<B0DataType>{});
+            b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
+            break;
+        default:
+            a_g_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+            b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+            b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
+        }
+
+        a_tensors.push_back(a_g_m_k);
+        b0_tensors.push_back(b0_g_k_n);
+        b1_tensors.push_back(b1_g_n_o);
+        c_tensors.push_back(c_gs_ms_os_device_result);
+
+        a_tensors_device.emplace_back(
+            std::make_unique<DeviceMem>(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize()));
+        b0_tensors_device.emplace_back(
+            std::make_unique<DeviceMem>(sizeof(B0DataType) * b0_g_k_n.mDesc.GetElementSpaceSize()));
+        b1_tensors_device.emplace_back(
+            std::make_unique<DeviceMem>(sizeof(B1DataType) * b1_g_n_o.mDesc.GetElementSpaceSize()));
+        c_tensors_device.emplace_back(std::make_unique<DeviceMem>(
+            sizeof(CDataType) * c_gs_ms_os_device_result.mDesc.GetElementSpaceSize()));
+
+        a_tensors_device[i]->ToDevice(a_g_m_k.mData.data());
+        b0_tensors_device[i]->ToDevice(b0_g_k_n.mData.data());
+        b1_tensors_device[i]->ToDevice(b1_g_n_o.mData.data());
+
+        p_a.push_back(a_tensors_device[i]->GetDeviceBuffer());
+        p_b0.push_back(b0_tensors_device[i]->GetDeviceBuffer());
+        p_b1.push_back(b1_tensors_device[i]->GetDeviceBuffer());
+        p_c.push_back(c_tensors_device[i]->GetDeviceBuffer());
+    }
+
+    auto a_element_op    = AElementOp{};
+    auto b0_element_op   = B0ElementOp{};
+    auto acc0_element_op = Acc0ElementOp{alpha};
+    auto b1_element_op   = B1ElementOp{};
+    auto c_element_op    = CElementOp{};
+
+    // do GEMM
+    auto gemm     = DeviceGemmInstance{};
+    auto invoker  = gemm.MakeInvoker();
+    auto argument = gemm.MakeArgument(p_a,
+                                      p_b0,
+                                      p_b1,
+                                      p_c,
+                                      problem_descs,
+                                      a_element_op,
+                                      b0_element_op,
+                                      acc0_element_op,
+                                      b1_element_op,
+                                      c_element_op);
+
+    // specify workspace for problem_desc
+    DeviceMem problem_desc_workspace(gemm.GetWorkSpaceSize(&argument));
+
+    gemm.SetWorkSpacePointer(&argument, problem_desc_workspace.GetDeviceBuffer());
+
+    if(!gemm.IsSupportedArgument(argument))
+    {
+        std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl;
+
+        return 0;
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_byte / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << gemm.GetTypeString() << std::endl;
+
+    bool pass = true;
+    if(do_verification)
+    {
+        for(std::size_t i = 0; i < group_count; i++)
+        {
+            const auto& M                  = problem_descs[i].M;
+            const auto& N                  = problem_descs[i].N;
+            const auto& O                  = problem_descs[i].O;
+            const auto& Batch              = problem_descs[i].Batch;
+            const auto& c_gs_ms_os_lengths = problem_descs[i].c_gs_ms_os_lengths;
+            const auto& c_gs_ms_os_strides = problem_descs[i].c_gs_ms_os_strides;
+
+            const auto& a_g_m_k            = a_tensors[i];
+            const auto& b0_g_k_n           = b0_tensors[i];
+            const auto& b1_g_n_o           = b1_tensors[i];
+            auto& c_gs_ms_os_device_result = c_tensors[i];
+            auto& c_gs_ms_os_device_buf    = *c_tensors_device[i];
+
+            Tensor<CDataType> c_gs_ms_os_host_result(
+                std::vector<std::size_t>(c_gs_ms_os_lengths.begin(), c_gs_ms_os_lengths.end()),
+                std::vector<std::size_t>(c_gs_ms_os_strides.begin(), c_gs_ms_os_strides.end()));
+
+            c_gs_ms_os_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data());
+
+            // Output of Gemm0 is input A of Gemm1
+            Tensor<AccDataType> acc0_m_n(f_host_tensor_descriptor(Batch, M, N, N, M * N, Row{}));
+
+            Tensor<ADataType> a1_g_m_n(f_host_tensor_descriptor(Batch, M, N, N, M * N, Row{}));
+
+            Tensor<CDataType> c_g_m_o_host_result(std::vector<int>{Batch, M, O},
+                                                  std::vector<int>{M * O, O, 1});
+
+            auto ref_gemm0          = ReferenceGemm0Instance{};
+            auto ref_gemm0_invoker  = ref_gemm0.MakeInvoker();
+            auto ref_gemm0_argument = ref_gemm0.MakeArgument(
+                a_g_m_k, b0_g_k_n, acc0_m_n, a_element_op, b0_element_op, acc0_element_op);
+
+            ref_gemm0_invoker.Run(ref_gemm0_argument);
+
+            auto ref_softmax          = ReferenceSoftmaxInstance{};
+            auto ref_softmax_invoker  = ref_softmax.MakeInvoker();
+            auto ref_softmax_argument = ref_softmax.MakeArgument(acc0_m_n, a1_g_m_n, 1, 0, {2});
+
+            ref_softmax_invoker.Run(ref_softmax_argument);
+
+            auto ref_gemm1          = ReferenceGemm1Instance{};
+            auto ref_gemm1_invoker  = ref_gemm1.MakeInvoker();
+            auto ref_gemm1_argument = ref_gemm1.MakeArgument(a1_g_m_n,
+                                                             b1_g_n_o,
+                                                             c_g_m_o_host_result,
+                                                             PassThrough{},
+                                                             b1_element_op,
+                                                             c_element_op);
+
+            ref_gemm1_invoker.Run(ref_gemm1_argument);
+
+            // Note: in this example, we merely permute the dimensions by changing underlying
+            // strides so we simply access data as-is
+            c_gs_ms_os_host_result.ForEach(
+                [&](auto& self, auto idx) { self(idx) = c_g_m_o_host_result(idx); });
+
+            bool pass_ =
+                ck::utils::check_err(c_gs_ms_os_device_result.mData, c_gs_ms_os_host_result.mData);
+            pass &= pass_;
+        }
+    }
+
+    return pass ? 0 : 1;
+}
--- a/example/37_batched_gemm_add_add_relu_gemm_add/CMakeLists.txt
+++ b/example/37_batched_gemm_add_add_relu_gemm_add/CMakeLists.txt
+add_example_executable(example_batched_gemm_add_add_relu_gemm_add_xdl_fp16 batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp)
--- a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
+++ b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+/*
+Computes C_m_o = Relu(A0[m, k] * B0[n, k] + D00[m, n] + D01[mn]) * B1[n, o] + D1[m, o]
+*/
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using A0DataType        = F16;
+using B0DataType        = F16;
+using Acc0DataType      = F32;
+using D00DataType       = F16;
+using D01DataType       = F16;
+using B1DataType        = F16;
+using Acc1DataType      = F32;
+using C1ShuffleDataType = F32;
+using D1DataType        = F16;
+using E1DataType        = F16;
+
+using A0Layout  = Row;
+using B0Layout  = Col;
+using D00Layout = Row;
+using D01Layout = Row;
+using B1Layout  = Row;
+using D1Layout  = Row;
+using E1Layout  = Row;
+
+// E = Relu(C + D0 + D1)
+struct AddAddRelu
+{
+    __host__ __device__ void
+    operator()(ck::half_t& e, const ck::half_t& c, const ck::half_t& d0, const ck::half_t& d1) const
+    {
+        const ck::half_t x = c + d0 + d1;
+
+        ck::tensor_operation::element_wise::Relu{}.template operator()<ck::half_t>(e, x);
+    }
+    __host__ __device__ void
+    operator()(float& e, const float& c, const ck::half_t& d0, const ck::half_t& d1) const
+    {
+        const float x = c + (d0 + d1);
+
+        ck::tensor_operation::element_wise::Relu{}.template operator()<float>(e, x);
+    }
+};
+
+// E = Gelu(C + D0 + D1)
+struct AddAddGelu
+{
+    __host__ __device__ void
+    operator()(ck::half_t& e, const ck::half_t& c, const ck::half_t& d0, const ck::half_t& d1) const
+    {
+        const ck::half_t x = c + d0 + d1;
+
+        ck::tensor_operation::element_wise::Gelu{}.template operator()<ck::half_t, ck::half_t>(e,
+                                                                                               x);
+    }
+
+    __host__ __device__ void
+    operator()(float& e, const float& c, const ck::half_t& d0, const ck::half_t& d1) const
+    {
+        const float x = c + (d0 + d1);
+
+        ck::tensor_operation::element_wise::Gelu{}.template operator()<float, float>(e, x);
+    }
+};
+
+// E = FastGelu(C + D0 + D1)
+struct AddAddFastGelu
+{
+    __host__ __device__ void
+    operator()(float& e, const float& c, const ck::half_t& d0, const ck::half_t& d1) const
+    {
+        const float x = c + (d0 + d1);
+
+        ck::tensor_operation::element_wise::FastGelu{}.template operator()<float, float>(e, x);
+    }
+};
+
+using A0ElementOp   = PassThrough;
+using B0ElementOp   = PassThrough;
+using CDE0ElementOp = AddAddRelu;
+using A1ElementOp   = PassThrough;
+using B1ElementOp   = PassThrough;
+using CDE1ElementOp = ck::tensor_operation::element_wise::Add;
+
+static constexpr bool PadGemm0M = false;
+static constexpr bool PadGemm0N = false;
+static constexpr bool PadGemm0K = false;
+static constexpr bool PadGemm1N = false;
+static constexpr bool PadGemm1K = false;
+
+using DeviceGemmInstance =
+    ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle<
+        A0Layout,
+        B0Layout,
+        ck::Tuple<D00Layout, D01Layout>,
+        B1Layout,
+        ck::Tuple<D1Layout>,
+        E1Layout,
+        A0DataType,
+        B0DataType,
+        Acc0DataType,
+        ck::Tuple<D00DataType, D01DataType>,
+        B1DataType,
+        Acc1DataType,
+        C1ShuffleDataType,
+        ck::Tuple<D1DataType>,
+        E1DataType,
+        A0ElementOp,
+        B0ElementOp,
+        CDE0ElementOp,
+        B1ElementOp,
+        CDE1ElementOp,
+        PadGemm0M,
+        PadGemm0N,
+        PadGemm0K,
+        PadGemm1N,
+        PadGemm1K,
+        1,
+        256,
+        128,         // MPerBlock
+        128,         // NPerBlock
+        32,          // KPerBlock
+        128,         // Gemm1NPerBlock
+        32,          // Gemm1KPerBlock
+        8,           // AK1
+        8,           // BK1
+        2,           // B1K1
+        32,          // MPerXDL
+        32,          // NPerXDL
+        1,           // MXdlPerWave
+        4,           // NXdlPerWave
+        4,           // Gemm1NXdlPerWave
+        S<4, 64, 1>, // ABlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        8,
+        8,
+        true,
+        S<4, 64, 1>, // BBlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        8,
+        8,
+        true,
+        S<8, 32, 1>, // B1BlockTransfer
+        S<0, 2, 1>,
+        S<0, 2, 1>,
+        1,
+        4,
+        2,
+        false,
+        1,              // CShuffleMXdlPerWavePerShuffle
+        2,              // CShuffleNXdlPerWavePerShuffle
+        S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+        8>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+
+    // GEMM shape
+    ck::index_t M              = 1024;
+    ck::index_t N              = 1024;
+    ck::index_t K              = 64;
+    ck::index_t O              = 128;
+    ck::index_t BatchCount     = 4;
+    ck::index_t StrideA0       = -1;
+    ck::index_t StrideB0       = -1;
+    ck::index_t StrideD00      = -1;
+    ck::index_t StrideD01      = -1;
+    ck::index_t StrideB1       = -1;
+    ck::index_t StrideD1       = -1;
+    ck::index_t StrideE1       = -1;
+    ck::index_t BatchStrideA0  = -1;
+    ck::index_t BatchStrideB0  = -1;
+    ck::index_t BatchStrideD00 = -1;
+    ck::index_t BatchStrideD01 = -1;
+    ck::index_t BatchStrideB1  = -1;
+    ck::index_t BatchStrideD1  = -1;
+    ck::index_t BatchStrideE1  = -1;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 9)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+        O = std::stoi(argv[7]);
+
+        BatchCount = std::stoi(argv[8]);
+    }
+    else if(argc == 23)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+        O = std::stoi(argv[7]);
+
+        BatchCount = std::stoi(argv[8]);
+
+        StrideA0  = std::stoi(argv[9]);
+        StrideB0  = std::stoi(argv[10]);
+        StrideD00 = std::stoi(argv[11]);
+        StrideD01 = std::stoi(argv[12]);
+        StrideB1  = std::stoi(argv[13]);
+        StrideD1  = std::stoi(argv[14]);
+        StrideE1  = std::stoi(argv[15]);
+
+        BatchStrideA0  = std::stoi(argv[16]);
+        BatchStrideB0  = std::stoi(argv[17]);
+        BatchStrideD00 = std::stoi(argv[18]);
+        BatchStrideD01 = std::stoi(argv[19]);
+        BatchStrideB1  = std::stoi(argv[20]);
+        BatchStrideD1  = std::stoi(argv[21]);
+        BatchStrideE1  = std::stoi(argv[22]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 8: M, N, K, O, Batch\n");
+        printf(
+            "arg9 to 15: StrideA0, StrideB0, StrideD00, StrideD01, StrideB1, StrideD1, StrideE1\n");
+        printf("arg16 to 22: BatchStrideA0, BatchStrideB0, BatchStrideD00, BatchStrideD01, "
+               "BatchStrideB1, BatchStrideD1, BatchStrideE1 \n");
+        exit(0);
+    }
+
+    const int DefaultStrideA0  = ck::is_same_v<A0Layout, Row> ? K : M;
+    const int DefaultStrideB0  = ck::is_same_v<B0Layout, Row> ? N : K;
+    const int DefaultStrideD00 = ck::is_same_v<D00Layout, Row> ? N : M;
+    const int DefaultStrideD01 = ck::is_same_v<D01Layout, Row> ? N : M;
+    const int DefaultStrideB1  = ck::is_same_v<B1Layout, Row> ? O : N;
+    const int DefaultStrideD1  = ck::is_same_v<D1Layout, Row> ? O : M;
+    const int DefaultStrideE1  = ck::is_same_v<E1Layout, Row> ? O : M;
+
+    StrideA0  = (StrideA0 < 0) ? DefaultStrideA0 : StrideA0;
+    StrideB0  = (StrideB0 < 0) ? DefaultStrideB0 : StrideB0;
+    StrideD00 = (StrideD00 < 0) ? DefaultStrideD00 : StrideD00;
+    StrideD01 = (StrideD01 < 0) ? DefaultStrideD01 : StrideD01;
+    StrideB1  = (StrideB1 < 0) ? DefaultStrideB1 : StrideB1;
+    StrideD1  = (StrideD1 < 0) ? DefaultStrideD1 : StrideD1;
+    StrideE1  = (StrideE1 < 0) ? DefaultStrideE1 : StrideE1;
+
+    const int DefaultBatchStrideA0  = (ck::is_same_v<A0Layout, Col> ? K : M) * StrideA0;
+    const int DefaultBatchStrideB0  = (ck::is_same_v<B0Layout, Col> ? N : K) * StrideB0;
+    const int DefaultBatchStrideD00 = (ck::is_same_v<D00Layout, Col> ? N : M) * StrideD00;
+    const int DefaultBatchStrideD01 = (ck::is_same_v<D01Layout, Col> ? N : M) * StrideD01;
+    const int DefaultBatchStrideB1  = (ck::is_same_v<B1Layout, Col> ? O : N) * StrideB1;
+    const int DefaultBatchStrideD1  = (ck::is_same_v<D1Layout, Col> ? O : M) * StrideD1;
+    const int DefaultBatchStrideE1  = (ck::is_same_v<E1Layout, Col> ? O : M) * StrideE1;
+
+    BatchStrideA0  = BatchStrideA0 < 0 ? DefaultBatchStrideA0 : BatchStrideA0;
+    BatchStrideB0  = BatchStrideB0 < 0 ? DefaultBatchStrideB0 : BatchStrideB0;
+    BatchStrideD00 = BatchStrideD00 < 0 ? DefaultBatchStrideD00 : BatchStrideD00;
+    BatchStrideD01 = BatchStrideD01 < 0 ? DefaultBatchStrideD01 : BatchStrideD01;
+    BatchStrideB1  = BatchStrideB1 < 0 ? DefaultBatchStrideB1 : BatchStrideB1;
+    BatchStrideD1  = BatchStrideD1 < 0 ? DefaultBatchStrideD1 : BatchStrideD1;
+    BatchStrideE1  = BatchStrideE1 < 0 ? DefaultBatchStrideE1 : BatchStrideE1;
+
+    auto f_host_tensor_descriptor = [](std::size_t batch_count,
+                                       std::size_t row,
+                                       std::size_t col,
+                                       std::size_t stride,
+                                       std::size_t batch_stride,
+                                       auto layout) {
+        if(std::is_same<decltype(layout), Row>::value)
+        {
+            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
+                                        std::vector<std::size_t>({batch_stride, stride, 1}));
+        }
+        else
+        {
+            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
+                                        std::vector<std::size_t>({batch_stride, 1, stride}));
+        }
+    };
+
+    // E_m_o = A_m_k * B0_k_n * B1_n_o
+    Tensor<A0DataType> a0_g_m_k(
+        f_host_tensor_descriptor(BatchCount, M, K, StrideA0, BatchStrideA0, A0Layout{}));
+    Tensor<B0DataType> b0_g_k_n(
+        f_host_tensor_descriptor(BatchCount, K, N, StrideB0, BatchStrideB0, B0Layout{}));
+    Tensor<D00DataType> d00_g_m_n(
+        f_host_tensor_descriptor(BatchCount, M, N, StrideD00, BatchStrideD00, D00Layout{}));
+    Tensor<D01DataType> d01_g_m_n(
+        f_host_tensor_descriptor(BatchCount, M, N, StrideD01, BatchStrideD01, D01Layout{}));
+    Tensor<B1DataType> b1_g_n_o(
+        f_host_tensor_descriptor(BatchCount, N, O, StrideB1, BatchStrideB1, B1Layout{}));
+    Tensor<D1DataType> d1_g_m_o(
+        f_host_tensor_descriptor(BatchCount, M, O, StrideD1, BatchStrideD1, D1Layout{}));
+    Tensor<E1DataType> e1_g_m_o_host_result(
+        f_host_tensor_descriptor(BatchCount, M, O, StrideE1, BatchStrideE1, E1Layout{}));
+    Tensor<E1DataType> e1_g_m_o_device_result(
+        f_host_tensor_descriptor(BatchCount, M, O, StrideE1, BatchStrideE1, E1Layout{}));
+
+    std::cout << "a0_g_m_k: " << a0_g_m_k.mDesc << std::endl;
+    std::cout << "b0_g_k_n: " << b0_g_k_n.mDesc << std::endl;
+    std::cout << "d00_g_m_n: " << d00_g_m_n.mDesc
+              << " size: " << d00_g_m_n.mDesc.GetElementSpaceSize() << std::endl;
+    std::cout << "d01_g_m_n: " << d01_g_m_n.mDesc
+              << " size: " << d01_g_m_n.mDesc.GetElementSpaceSize() << std::endl;
+    std::cout << "b1_g_n_o: " << b1_g_n_o.mDesc << std::endl;
+    std::cout << "e1_g_m_o: " << e1_g_m_o_host_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a0_g_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-2, 3});
+        b0_g_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 3});
+        d00_g_m_n.GenerateTensorValue(GeneratorTensor_2<D00DataType>{-2, 3});
+        d01_g_m_n.GenerateTensorValue(GeneratorTensor_2<D01DataType>{-2, 3});
+        b1_g_n_o.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-2, 3});
+        d1_g_m_o.GenerateTensorValue(GeneratorTensor_2<D1DataType>{-2, 3});
+        break;
+    case 2:
+        a0_g_m_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
+        b0_g_k_n.GenerateTensorValue(GeneratorTensor_3<B0DataType>{0.0, 1.0});
+        d00_g_m_n.GenerateTensorValue(GeneratorTensor_3<D00DataType>{0.0, 1.0});
+        d01_g_m_n.GenerateTensorValue(GeneratorTensor_3<D01DataType>{0.0, 1.0});
+        b1_g_n_o.GenerateTensorValue(GeneratorTensor_3<B1DataType>{-0.5, 0.5});
+        d1_g_m_o.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
+        break;
+    default:
+        a0_g_m_k.GenerateTensorValue(GeneratorTensor_1<A0DataType>{1});
+        b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+        d00_g_m_n.GenerateTensorValue(GeneratorTensor_1<D00DataType>{1});
+        d01_g_m_n.GenerateTensorValue(GeneratorTensor_1<D01DataType>{1});
+        b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
+        d1_g_m_o.GenerateTensorValue(GeneratorTensor_1<D1DataType>{1});
+    }
+
+    DeviceMem a0_g_m_k_device_buf(sizeof(A0DataType) * a0_g_m_k.mDesc.GetElementSize());
+    DeviceMem b0_g_k_n_device_buf(sizeof(B0DataType) * b0_g_k_n.mDesc.GetElementSize());
+    DeviceMem d00_g_m_n_device_buf(sizeof(D00DataType) * d00_g_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem d01_g_m_n_device_buf(sizeof(D01DataType) * d01_g_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem b1_g_n_o_device_buf(sizeof(B1DataType) * b1_g_n_o.mDesc.GetElementSize());
+    DeviceMem e1_g_m_o_device_buf(sizeof(E1DataType) *
+                                  e1_g_m_o_device_result.mDesc.GetElementSize());
+    DeviceMem d1_g_m_o_device_buf(sizeof(D1DataType) * d1_g_m_o.mDesc.GetElementSpaceSize());
+
+    a0_g_m_k_device_buf.ToDevice(a0_g_m_k.mData.data());
+    b0_g_k_n_device_buf.ToDevice(b0_g_k_n.mData.data());
+    d00_g_m_n_device_buf.ToDevice(d00_g_m_n.mData.data());
+    d01_g_m_n_device_buf.ToDevice(d01_g_m_n.mData.data());
+    b1_g_n_o_device_buf.ToDevice(b1_g_n_o.mData.data());
+    d1_g_m_o_device_buf.ToDevice(d1_g_m_o.mData.data());
+
+    auto a0_element_op   = A0ElementOp{};
+    auto b0_element_op   = B0ElementOp{};
+    auto cde0_element_op = CDE0ElementOp{};
+    auto b1_element_op   = B1ElementOp{};
+    auto cde1_element_op = CDE1ElementOp{};
+
+    // do GEMM
+    auto gemm    = DeviceGemmInstance{};
+    auto invoker = gemm.MakeInvoker();
+    auto argument =
+        gemm.MakeArgument(static_cast<A0DataType*>(a0_g_m_k_device_buf.GetDeviceBuffer()),
+                          static_cast<B0DataType*>(b0_g_k_n_device_buf.GetDeviceBuffer()),
+                          std::array<const void*, 2>{d00_g_m_n_device_buf.GetDeviceBuffer(),
+                                                     d01_g_m_n_device_buf.GetDeviceBuffer()},
+                          static_cast<B1DataType*>(b1_g_n_o_device_buf.GetDeviceBuffer()),
+                          std::array<const void*, 1>{d1_g_m_o_device_buf.GetDeviceBuffer()},
+                          static_cast<E1DataType*>(e1_g_m_o_device_buf.GetDeviceBuffer()),
+                          M,
+                          N,
+                          K,
+                          O,
+                          BatchCount,
+                          StrideA0,
+                          StrideB0,
+                          std::array<ck::index_t, 2>{StrideD00, StrideD01},
+                          StrideB1,
+                          std::array<ck::index_t, 1>{StrideD1},
+                          StrideE1,
+                          BatchStrideA0,
+                          BatchStrideB0,
+                          std::array<ck::index_t, 2>{BatchStrideD00, BatchStrideD01},
+                          BatchStrideB1,
+                          std::array<ck::index_t, 1>{BatchStrideD1},
+                          BatchStrideE1,
+                          a0_element_op,
+                          b0_element_op,
+                          cde0_element_op,
+                          b1_element_op,
+                          cde1_element_op);
+
+    if(!gemm.IsSupportedArgument(argument))
+    {
+        std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl;
+
+        return 0;
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount;
+    std::size_t num_btype =
+        (sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(D00DataType) * N +
+         sizeof(D01DataType) * N + sizeof(B1DataType) * N * O + sizeof(E1DataType) * M * O +
+         sizeof(D1DataType) * O) *
+        BatchCount;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << gemm.GetTypeString() << std::endl;
+
+    e1_g_m_o_device_buf.FromDevice(e1_g_m_o_device_result.mData.data());
+
+    if(do_verification)
+    {
+        using ReferenceGemm0Instance =
+            ck::tensor_operation::host::ReferenceBatchedGemm<A0DataType,
+                                                             B0DataType,
+                                                             Acc0DataType,
+                                                             Acc0DataType,
+                                                             A0ElementOp,
+                                                             B0ElementOp,
+                                                             PassThrough>;
+
+        using ReferenceGemm1Instance =
+            ck::tensor_operation::host::ReferenceBatchedGemm<Acc0DataType,
+                                                             B1DataType,
+                                                             Acc1DataType,
+                                                             Acc1DataType,
+                                                             PassThrough,
+                                                             B1ElementOp,
+                                                             PassThrough>;
+
+        // Output of Gemm0 is input A of Gemm1
+        Tensor<Acc0DataType> c0_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{}));
+        Tensor<Acc0DataType> e0_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{}));
+        Tensor<Acc1DataType> c1_g_m_o(f_host_tensor_descriptor(BatchCount, M, O, O, M * O, Row{}));
+
+        auto ref_gemm0          = ReferenceGemm0Instance{};
+        auto ref_gemm0_invoker  = ref_gemm0.MakeInvoker();
+        auto ref_gemm0_argument = ref_gemm0.MakeArgument(
+            a0_g_m_k, b0_g_k_n, c0_g_m_n, a0_element_op, b0_element_op, PassThrough{});
+
+        ref_gemm0_invoker.Run(ref_gemm0_argument);
+
+        // bias+bias+relu
+        e0_g_m_n.ForEach([&](auto&, auto idx) {
+            cde0_element_op(e0_g_m_n(idx), c0_g_m_n(idx), d00_g_m_n(idx), d01_g_m_n(idx));
+        });
+
+        auto ref_gemm1          = ReferenceGemm1Instance{};
+        auto ref_gemm1_invoker  = ref_gemm1.MakeInvoker();
+        auto ref_gemm1_argument = ref_gemm1.MakeArgument(
+            e0_g_m_n, b1_g_n_o, c1_g_m_o, PassThrough{}, b1_element_op, PassThrough{});
+
+        ref_gemm1_invoker.Run(ref_gemm1_argument);
+
+        // bias
+        e1_g_m_o_host_result.ForEach([&](auto&, auto idx) {
+            cde1_element_op(e1_g_m_o_host_result(idx), c1_g_m_o(idx), d1_g_m_o(idx));
+        });
+
+        return ck::utils::check_err(e1_g_m_o_device_result.mData, e1_g_m_o_host_result.mData) ? 0
+                                                                                              : 1;
+    }
+
+    return 0;
+}
--- a/example/38_grouped_conv_bwd_data_bias_relu/CMakeLists.txt
+++ b/example/38_grouped_conv_bwd_data_bias_relu/CMakeLists.txt
+add_example_executable(example_grouped_conv_bwd_data_bias_relu_fp16 grouped_conv_bwd_data_bias_relu_fp16.cpp)
--- a/example/38_grouped_conv_bwd_data_bias_relu/grouped_conv_bwd_data_bias_relu_common.hpp
+++ b/example/38_grouped_conv_bwd_data_bias_relu/grouped_conv_bwd_data_bias_relu_common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
+
+void print_helper_msg()
+{
+    std::cout << "arg1: verification (0=no, 1=yes)\n"
+              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
+              << "arg3: time kernel (0=no, 1=yes)\n"
+              << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
+}
+
+template <ck::index_t NDimSpatial,
+          typename OutDataType,
+          typename WeiDataType,
+          typename BiasDataType,
+          typename InDataType,
+          typename OutElementOp,
+          typename WeiElementOp,
+          typename InElementOp,
+          typename DeviceInstance>
+int run_conv_bwd_data_bias_relu(bool do_verification,
+                                int init_method,
+                                bool time_kernel,
+                                const ck::utils::conv::ConvParam& conv_param,
+                                const HostTensorDescriptor& out_g_n_k_wos_desc,
+                                const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                                const HostTensorDescriptor& bias_g_n_c_wis_desc,
+                                const HostTensorDescriptor& in_g_n_c_wis_desc,
+                                const OutElementOp& out_element_op,
+                                const WeiElementOp& wei_element_op,
+                                const InElementOp& in_element_op)
+{
+    Tensor<OutDataType> out(out_g_n_k_wos_desc);
+    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
+    Tensor<BiasDataType> bias(bias_g_n_c_wis_desc);
+    Tensor<InDataType> in_host(in_g_n_c_wis_desc);
+    Tensor<InDataType> in_device(in_g_n_c_wis_desc);
+
+    std::cout << "out: " << out.mDesc << std::endl;
+    std::cout << "wei: " << wei.mDesc << std::endl;
+    std::cout << "bias: " << bias.mDesc << std::endl;
+    std::cout << "in: " << in_host.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+        bias.GenerateTensorValue(GeneratorTensor_2<BiasDataType>{-5, 5});
+        break;
+    default:
+        out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
+        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
+        bias.GenerateTensorValue(GeneratorTensor_3<BiasDataType>{0.0, 1.0});
+    }
+
+    DeviceMem out_device_buf(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
+    DeviceMem bias_device_buf(sizeof(BiasDataType) * bias.mDesc.GetElementSpaceSize());
+    DeviceMem in_device_buf(sizeof(InDataType) * in_device.mDesc.GetElementSpaceSize());
+
+    out_device_buf.ToDevice(out.mData.data());
+    wei_device_buf.ToDevice(wei.mData.data());
+    bias_device_buf.ToDevice(bias.mData.data());
+
+    // reset input to zero
+    in_device_buf.SetZero();
+
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> d0_g_n_c_wis_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> d0_g_n_c_wis_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_c_wis_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_c_wis_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+
+    auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
+
+    copy(out_g_n_k_wos_desc.GetLengths(), a_g_n_k_wos_lengths);
+    copy(out_g_n_k_wos_desc.GetStrides(), a_g_n_k_wos_strides);
+    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
+    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
+    copy(bias_g_n_c_wis_desc.GetLengths(), d0_g_n_c_wis_lengths);
+    copy(bias_g_n_c_wis_desc.GetStrides(), d0_g_n_c_wis_strides);
+    copy(in_g_n_c_wis_desc.GetLengths(), e_g_n_c_wis_lengths);
+    copy(in_g_n_c_wis_desc.GetStrides(), e_g_n_c_wis_strides);
+    copy(conv_param.conv_filter_strides_, conv_filter_strides);
+    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_param.input_left_pads_, input_left_pads);
+    copy(conv_param.input_right_pads_, input_right_pads);
+
+    // do conv
+    auto conv     = DeviceInstance{};
+    auto invoker  = conv.MakeInvoker();
+    auto argument = conv.MakeArgument(
+        out_device_buf.GetDeviceBuffer(),
+        wei_device_buf.GetDeviceBuffer(),
+        std::array<const void*, 1>{bias_device_buf.GetDeviceBuffer()},
+        in_device_buf.GetDeviceBuffer(),
+        a_g_n_k_wos_lengths,
+        a_g_n_k_wos_strides,
+        b_g_k_c_xs_lengths,
+        b_g_k_c_xs_strides,
+        std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{d0_g_n_c_wis_lengths},
+        std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{d0_g_n_c_wis_strides},
+        e_g_n_c_wis_lengths,
+        e_g_n_c_wis_strides,
+        conv_filter_strides,
+        conv_filter_dilations,
+        input_left_pads,
+        input_right_pads,
+        out_element_op,
+        wei_element_op,
+        in_element_op);
+
+    if(!conv.IsSupportedArgument(argument))
+    {
+        printf("wrong! device_conv with the specified compilation parameters does "
+               "not support this Conv problem\n");
+
+        return 1;
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    std::size_t flop      = conv_param.GetFlops();
+    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << std::endl;
+
+    if(do_verification)
+    {
+        using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+        // c doesn't physically exist, any layout is fine
+        Tensor<float> c_host(in_g_n_c_wis_desc);
+
+        auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData<NDimSpatial,
+                                                                         float,
+                                                                         WeiDataType,
+                                                                         OutDataType,
+                                                                         PassThrough,
+                                                                         WeiElementOp,
+                                                                         OutElementOp>();
+
+        auto ref_invoker = ref_conv.MakeInvoker();
+
+        auto ref_argument = ref_conv.MakeArgument(c_host,
+                                                  wei,
+                                                  out,
+                                                  conv_param.conv_filter_strides_,
+                                                  conv_param.conv_filter_dilations_,
+                                                  conv_param.input_left_pads_,
+                                                  conv_param.input_right_pads_,
+                                                  PassThrough{},
+                                                  wei_element_op,
+                                                  out_element_op);
+
+        ref_invoker.Run(ref_argument);
+
+        // TODO: implement elementwise operation for host
+        in_host.ForEach(
+            [&](auto&, auto idx) { in_element_op(in_host(idx), c_host(idx), bias(idx)); });
+
+        in_device_buf.FromDevice(in_device.mData.data());
+
+        return ck::utils::check_err(in_device.mData, in_host.mData) ? 0 : 1;
+    }
+
+    return 0;
+}
--- a/example/38_grouped_conv_bwd_data_bias_relu/grouped_conv_bwd_data_bias_relu_fp16.cpp
+++ b/example/38_grouped_conv_bwd_data_bias_relu/grouped_conv_bwd_data_bias_relu_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "grouped_conv_bwd_data_bias_relu_common.hpp"
+
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using OutDataType      = ck::half_t;
+using WeiDataType      = ck::half_t;
+using AccDataType      = float;
+using CShuffleDataType = ck::half_t;
+using BiasDataType     = ck::half_t; // bias
+using InDataType       = ck::half_t;
+
+using OutLayout  = ck::tensor_layout::convolution::GNHWK;
+using WeiLayout  = ck::tensor_layout::convolution::GKYXC;
+using BiasLayout = ck::tensor_layout::convolution::G_C;
+using InLayout   = ck::tensor_layout::convolution::GNHWC;
+
+using OutElementOp     = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp     = ck::tensor_operation::element_wise::PassThrough;
+using CBiasInElementOp = ck::tensor_operation::element_wise::AddRelu;
+
+static constexpr auto ConvBwdDataDefault =
+    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default;
+
+template <ck::index_t NDimSpatial>
+using DeviceConvNdBwdDataInstance =
+    ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<
+        NDimSpatial,
+        OutLayout,
+        WeiLayout,
+        ck::Tuple<BiasLayout>,
+        InLayout,
+        OutDataType,
+        WeiDataType,
+        AccDataType,
+        CShuffleDataType,
+        ck::Tuple<BiasDataType>,
+        InDataType,
+        OutElementOp,
+        WeiElementOp,
+        CBiasInElementOp,
+        ConvBwdDataDefault,
+        true, // DoPadGemmM
+        true, // DoPadGemmN
+        1,
+        256,
+        128,
+        256,
+        32,
+        8,
+        2,
+        32,
+        32,
+        2,
+        4,
+        S<4, 64, 1>,
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        8,
+        8,
+        1,
+        S<4, 64, 1>,
+        S<0, 2, 1>,
+        S<0, 2, 1>,
+        1,
+        4,
+        2,
+        0,
+        1,
+        1,
+        S<1, 32, 1, 8>,
+        8>;
+
+int main(int argc, char* argv[])
+{
+    namespace ctc = ck::tensor_layout::convolution;
+
+    print_helper_msg();
+
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+
+    ck::utils::conv::ConvParam conv_param{
+        2, 2, 128, 256, 256, {3, 3}, {14, 14}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else
+    {
+        do_verification                   = std::stoi(argv[1]);
+        init_method                       = std::stoi(argv[2]);
+        time_kernel                       = std::stoi(argv[3]);
+        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
+
+        conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
+    }
+
+    const auto in_element_op  = CBiasInElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+    const auto out_element_op = OutElementOp{};
+
+    if(conv_param.num_dim_spatial_ == 2)
+    {
+        // output image: GNHWK
+        const auto out_g_n_k_wos_desc =
+            ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
+                conv_param);
+
+        // weight: GKYXC
+        const auto wei_g_k_c_xs_desc =
+            ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(
+                conv_param);
+
+        // input image bias: G_C
+        const auto bias_g_n_c_wis_desc =
+            HostTensorDescriptor({conv_param.G_,
+                                  conv_param.N_,
+                                  conv_param.C_,
+                                  conv_param.input_spatial_lengths_[0],
+                                  conv_param.input_spatial_lengths_[1]},
+                                 {
+                                     conv_param.C_, // g
+                                     0,             // n
+                                     1,             // c
+                                     0,             // hi
+                                     0              // wi
+                                 });
+
+        // input image: GNHWC
+        const auto in_g_n_c_wis_desc =
+            ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
+                conv_param);
+
+        using DeviceInstance = DeviceConvNdBwdDataInstance<2>;
+
+        run_conv_bwd_data_bias_relu<2,
+                                    OutDataType,
+                                    WeiDataType,
+                                    BiasDataType,
+                                    InDataType,
+                                    OutElementOp,
+                                    WeiElementOp,
+                                    CBiasInElementOp,
+                                    DeviceInstance>(do_verification,
+                                                    init_method,
+                                                    time_kernel,
+                                                    conv_param,
+                                                    out_g_n_k_wos_desc,
+                                                    wei_g_k_c_xs_desc,
+                                                    bias_g_n_c_wis_desc,
+                                                    in_g_n_c_wis_desc,
+                                                    wei_element_op,
+                                                    out_element_op,
+                                                    in_element_op);
+    }
+
+    return 0;
+}
--- a/example/39_permute/CMakeLists.txt
+++ b/example/39_permute/CMakeLists.txt
+add_custom_target(example_permute)
+
+add_example_executable(example_permute_1xHxW_fp16 permute_1xHxW_fp16.cpp)
+add_example_executable(example_permute_NxHxW_fp16 permute_NxHxW_fp16.cpp)
+add_example_executable(example_permute_HxWx4_fp16 permute_HxWx4_fp16.cpp)
+
+add_dependencies(example_permute example_permute_1xHxW_fp16)
+add_dependencies(example_permute example_permute_NxHxW_fp16)
+add_dependencies(example_permute example_permute_HxWx4_fp16)
--- a/example/39_permute/common.hpp
+++ b/example/39_permute/common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <iterator>
+#include <numeric>
+#include <type_traits>
+#include <utility>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp"
+#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
+#include "ck/utility/type.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/fill.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+using F64 = double;
+
+struct Problem final
+{
+    static constexpr std::size_t NumDim = 3;
+
+    using Shape = std::array<std::size_t, NumDim>;
+    using Axes  = Shape;
+
+    Problem() = delete;
+
+    explicit Problem(const Shape& default_shape, const Axes& default_axes)
+        : shape(default_shape), axes(default_axes)
+    {
+    }
+
+    Shape shape;
+    Axes axes;
+};
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+namespace detail {
+
+template <typename Array, std::size_t Difference>
+struct enlarge_array_size;
+
+template <typename T, std::size_t Size, std::size_t Difference>
+struct enlarge_array_size<std::array<T, Size>, Difference>
+{
+    using type = std::array<T, Size + Difference>;
+};
+
+template <typename Array, std::size_t Difference>
+using enlarge_array_size_t = typename enlarge_array_size<Array, Difference>::type;
+
+template <typename Array>
+struct get_array_size;
+
+template <typename T, std::size_t Size>
+struct get_array_size<std::array<T, Size>> : std::integral_constant<std::size_t, Size>
+{
+};
+
+template <typename Array>
+inline constexpr std::size_t get_array_size_v = get_array_size<Array>::value;
+
+template <typename T, typename = void>
+struct is_iterator : std::false_type
+{
+};
+
+template <typename T>
+struct is_iterator<T,
+                   std::void_t<decltype(*std::declval<T>()),
+                               decltype(++std::declval<std::add_lvalue_reference_t<T>>()),
+                               decltype(std::declval<std::add_lvalue_reference_t<T>>()++)>>
+    : std::true_type
+{
+};
+
+template <typename T>
+inline constexpr bool is_iterator_v = is_iterator<T>::value;
+
+struct Placeholder final
+{
+    template <typename T>
+    constexpr inline operator T() const noexcept;
+};
+
+template <typename Iterator, typename = void>
+struct is_output_iterator : std::false_type
+{
+};
+
+template <typename Iterator>
+struct is_output_iterator<
+    Iterator,
+    std::void_t<decltype(*std::declval<Iterator>() = std::declval<Placeholder>())>>
+    : std::bool_constant<is_iterator_v<Iterator>>
+{
+};
+
+template <typename T>
+inline constexpr bool is_output_iterator_v = is_output_iterator<T>::value;
+
+template <typename Iterator, typename = void>
+struct is_bidirectional_iterator : std::false_type
+{
+};
+
+template <typename Iterator>
+struct is_bidirectional_iterator<
+    Iterator,
+    std::void_t<decltype(--std::declval<std::add_lvalue_reference_t<Iterator>>()),
+                decltype(std::declval<std::add_lvalue_reference_t<Iterator>>()--)>>
+    : std::bool_constant<is_iterator_v<Iterator>>
+{
+};
+
+template <typename Iterator>
+inline constexpr bool is_bidirectional_iterator_v = is_bidirectional_iterator<Iterator>::value;
+
+template <typename Iterator, typename = void>
+struct is_random_access_iterator : std::false_type
+{
+};
+
+template <typename Iterator>
+struct is_random_access_iterator<Iterator,
+                                 std::void_t<decltype(std::declval<Iterator>() + 1),
+                                             decltype(std::declval<Iterator>() - 1),
+                                             decltype(std::declval<Iterator>()[1])>>
+    : std::bool_constant<is_iterator_v<Iterator>>
+{
+};
+
+template <typename Iterator>
+inline constexpr bool is_random_access_iterator_v = is_random_access_iterator<Iterator>::value;
+
+template <typename T, typename = void>
+struct is_range : std::false_type
+{
+};
+
+template <typename T>
+struct is_range<T,
+                std::void_t<decltype(begin(std::declval<T>())),
+                            decltype(end(std::declval<T>())),
+                            decltype(begin(std::declval<T>()) != end(std::declval<T>()))>>
+    : std::bool_constant<is_iterator_v<ck::remove_cvref_t<decltype(begin(std::declval<T>()))>>>
+{
+};
+
+template <typename T>
+inline constexpr bool is_range_v = is_range<T>::value;
+
+template <typename Range, typename = void>
+struct is_sized_range : std::false_type
+{
+};
+
+template <typename Range>
+struct is_sized_range<Range, std::void_t<decltype(size(std::declval<Range>()))>>
+    : std::bool_constant<is_range_v<Range>>
+{
+};
+
+template <typename Range>
+inline constexpr bool is_sized_range_v = is_sized_range<Range>::value;
+
+template <typename Range, typename = void>
+struct is_bidirectional_range : std::false_type
+{
+};
+
+template <typename Range>
+struct is_bidirectional_range<Range, std::void_t<>>
+    : std::bool_constant<
+          is_range_v<Range> &&
+          is_bidirectional_iterator_v<ck::remove_cvref_t<decltype(begin(std::declval<Range>()))>>>
+{
+};
+
+template <typename Range>
+inline constexpr bool is_bidirectional_range_v = is_bidirectional_range<Range>::value;
+
+template <typename Range, typename = void>
+struct is_random_access_range : std::false_type
+{
+};
+
+template <typename Range>
+struct is_random_access_range<Range, std::void_t<>>
+    : std::bool_constant<
+          is_range_v<Range> &&
+          is_random_access_iterator_v<ck::remove_cvref_t<decltype(begin(std::declval<Range>()))>>>
+{
+};
+
+template <typename Range>
+inline constexpr bool is_random_access_range_v = is_random_access_range<Range>::value;
+
+template <typename Range>
+class to_array_proxy
+{
+    static_assert(is_range_v<Range>);
+
+    public:
+    explicit to_array_proxy(const Range& source) noexcept : source_(source) {}
+
+    template <typename T, std::size_t Size>
+    operator std::array<T, Size>() const
+    {
+        std::array<T, Size> destination;
+
+        std::copy_n(std::begin(source_),
+                    std::min<std::size_t>(Size, std::size(source_)),
+                    std::begin(destination));
+
+        return destination;
+    }
+
+    private:
+    const Range& source_;
+};
+
+} // namespace detail
+
+template <typename Range>
+inline auto to_array(Range& range) noexcept
+    -> std::enable_if_t<detail::is_range_v<Range>,
+                        detail::to_array_proxy<ck::remove_cvref_t<Range>>>
+{
+    return detail::to_array_proxy<ck::remove_cvref_t<Range>>{range};
+}
+
+namespace ranges {
+template <typename InputRange, typename OutputIterator>
+inline auto copy(InputRange&& range, OutputIterator iter)
+    -> decltype(std::copy(std::begin(std::forward<InputRange>(range)),
+                          std::end(std::forward<InputRange>(range)),
+                          iter))
+{
+    return std::copy(std::begin(std::forward<InputRange>(range)),
+                     std::end(std::forward<InputRange>(range)),
+                     iter);
+}
+} // namespace ranges
+
+template <typename Axes>
+inline auto is_valid_axes(const Axes& axes)
+    -> std::enable_if_t<detail::is_random_access_range_v<Axes>, bool>
+{
+    using std::empty;
+    if(empty(axes))
+    {
+        return false;
+    }
+
+    using std::begin, std::end;
+    std::vector<std::size_t> sorted_axes(begin(axes), end(axes));
+
+    std::sort(begin(sorted_axes), end(sorted_axes));
+    const auto last = std::unique(begin(sorted_axes), end(sorted_axes));
+
+    return (last == end(sorted_axes)) && (*begin(sorted_axes) == 0) &&
+           (*std::prev(last) == size(axes) - 1);
+}
+
+template <typename Shape>
+inline auto is_valid_shape(const Shape& shape) -> std::enable_if_t<detail::is_range_v<Shape>, bool>
+{
+    static_assert(std::is_unsigned_v<ck::remove_cvref_t<decltype(*std::begin(shape))>>);
+
+    using std::begin, std::end;
+    using std::empty;
+    return !empty(shape) && std::all_of(begin(shape), end(shape), [](auto dim) { return 0 < dim; });
+}
+
+template <typename Shape, typename Indices>
+inline auto is_valid_indices(const Shape& shape, const Indices& indices)
+    -> std::enable_if_t<detail::is_sized_range_v<Shape> && detail::is_sized_range_v<Indices>, bool>
+{
+    static_assert(std::is_unsigned_v<ck::remove_cvref_t<decltype(*std::begin(indices))>>);
+
+    if(!is_valid_shape(shape))
+    {
+        return false;
+    }
+
+    using std::empty;
+    if(empty(indices))
+    {
+        return false;
+    }
+
+    using std::size;
+    if(size(shape) != size(indices))
+    {
+        return false;
+    }
+
+    using std::begin, std::end;
+
+    auto dim = begin(shape);
+    auto idx = begin(indices);
+    for(; dim != end(shape) && idx != end(indices); ++dim, ++idx)
+    {
+        if(*dim <= *idx)
+        {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+template <std::size_t Size>
+std::array<std::size_t, Size> transpose(const std::array<std::size_t, Size>& shape,
+                                        const std::array<std::size_t, Size>& axes)
+{
+    assert(is_valid_shape(shape) && is_valid_axes(axes));
+
+    std::array<std::size_t, Size> transposed;
+    auto iter = std::begin(transposed);
+    for(const auto axis : axes)
+    {
+        *iter++ = shape[axis];
+    }
+
+    return transposed;
+}
+
+auto extend_shape(const Problem::Shape& shape, std::size_t new_dim)
+{
+    detail::enlarge_array_size_t<Problem::Shape, 1> extended_shape;
+
+    using std::begin, std::end;
+
+    std::copy(begin(shape), end(shape), begin(extended_shape));
+    extended_shape.back() = new_dim;
+
+    return extended_shape;
+}
+
+auto extend_axes(const Problem::Axes& axes)
+{
+    detail::enlarge_array_size_t<Problem::Axes, 1> extended_axes;
+
+    using std::begin, std::end;
+
+    std::copy(begin(axes), end(axes), begin(extended_axes));
+    extended_axes.back() = detail::get_array_size_v<Problem::Axes>;
+
+    return extended_axes;
+}
+
+template <typename Shape, typename Indices>
+auto advance_indices(const Shape& shape, Indices& indices) -> std::enable_if_t<
+    detail::is_bidirectional_range_v<Shape> && detail::is_sized_range_v<Shape> &&
+        detail::is_bidirectional_range_v<Indices> && detail::is_sized_range_v<Indices>,
+    bool>
+{
+    using std::size;
+    if(!(is_valid_shape(shape) && is_valid_indices(shape, indices) && size(shape) == size(indices)))
+    {
+        return false;
+    }
+
+    bool carry = true;
+
+    using std::rbegin, std::rend;
+    auto dim = rbegin(shape);
+    auto idx = rbegin(indices);
+    for(; carry && dim != rend(shape) && idx != rend(indices); ++dim, ++idx)
+    {
+        *idx  = (*idx + carry);
+        carry = ((*idx == *dim) ? (*idx = 0, true) : false);
+    }
+
+    return !carry;
+}
+
+template <typename Src, typename Axes, typename Functor, typename Dest>
+auto host_permute(const Tensor<Src>& src, const Axes& axes, Functor functor, Tensor<Dest>& dest)
+    -> std::enable_if_t<detail::is_random_access_range_v<Axes> && detail::is_sized_range_v<Axes> &&
+                            std::is_invocable_v<Functor,
+                                                std::add_lvalue_reference_t<Dest>,
+                                                std::add_lvalue_reference_t<Src>>,
+                        bool>
+{
+    const auto& shape            = src.mDesc.GetLengths();
+    const auto& transposed_shape = dest.mDesc.GetLengths();
+    if(!(is_valid_shape(shape) && is_valid_shape(transposed_shape)))
+    {
+        return false;
+    }
+
+    using std::size;
+    if(!is_valid_axes(axes))
+    {
+        return false;
+    }
+
+    static_assert(detail::is_sized_range_v<ck::remove_cvref_t<decltype(shape)>> &&
+                  detail::is_sized_range_v<ck::remove_cvref_t<decltype(transposed_shape)>>);
+
+    if(size(shape) != size(transposed_shape))
+    {
+        return false;
+    }
+
+    static_assert(detail::is_random_access_range_v<ck::remove_cvref_t<decltype(shape)>> &&
+                  detail::is_random_access_range_v<ck::remove_cvref_t<decltype(transposed_shape)>>);
+    {
+        for(std::size_t idx = 0; idx < size(shape); ++idx)
+        {
+            if(transposed_shape[idx] != shape[axes[idx]])
+            {
+                return false;
+            }
+        }
+    }
+
+    std::vector<std::size_t> indices(size(shape), 0);
+    if(!is_valid_indices(shape, indices))
+    {
+        return false;
+    }
+
+    switch(size(shape))
+    {
+    case 3: {
+        do
+        {
+            Dest output = 0;
+            functor(output, src(indices[0], indices[1], indices[2]));
+            dest(indices[axes[0]], indices[axes[1]], indices[axes[2]]) = output;
+        } while(advance_indices(shape, indices));
+    }
+    break;
+    case 4: {
+        do
+        {
+            Dest output = 0;
+            functor(output, src(indices[0], indices[1], indices[2], indices[3]));
+            dest(indices[axes[0]], indices[axes[1]], indices[axes[2]], indices[axes[3]]) = output;
+        } while(advance_indices(shape, indices));
+    }
+    break;
+    default: return false;
+    }
+
+    return true;
+}
--- a/example/39_permute/permute_1xHxW_fp16.cpp
+++ b/example/39_permute/permute_1xHxW_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+using InDataType  = F16;
+using OutDataType = F16;
+
+// clang-format off
+using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl
+// ######| NumDim|     InData|     OutData| Elementwise| Block|  NPer|  HPer|  WPer|   InBlock|      InBlockTransfer|           InBlockTransfer|       Src|       Dst|             Src|             Dst|
+// ######|       |       Type|        Type|   Operation|  Size| Block| Block| Block| LdsExtraW| ThreadClusterLengths| ThreadClusterArrangeOrder| VectorDim| VectorDim| ScalarPerVector| ScalarPerVector|
+// ######|       |           |            |            |      |      |      |      |          |                     |                          |          |          |                |                |
+// ######|       |           |            |            |      |      |      |      |          |                     |                          |          |          |                |                |
+         <      3, InDataType, OutDataType, PassThrough,   256,     1,    32,    32,         3,         S<1, 32,  8>,                S<0, 1, 2>,         2,         1,               2,               1>;
+// clang-format on
+
+#include "run_permute_element_example.inc"
+
+int main() { return !run_permute_element_example({1, 32000, 80}, {0, 2, 1}); }
--- a/example/39_permute/permute_HxWx4_fp16.cpp
+++ b/example/39_permute/permute_HxWx4_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+using DataType   = F16;
+using BundleType = F64;
+
+static_assert(sizeof(BundleType) % sizeof(DataType) == 0);
+
+// clang-format off
+using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl
+// ######| NumDim|     InData|     OutData| Elementwise| Block|  NPer|  HPer|  WPer|   InBlock|      InBlockTransfer|           InBlockTransfer|       Src|       Dst|             Src|             Dst|
+// ######|       |       Type|        Type|   Operation|  Size| Block| Block| Block| LdsExtraW| ThreadClusterLengths| ThreadClusterArrangeOrder| VectorDim| VectorDim| ScalarPerVector| ScalarPerVector|
+// ######|       |           |            |            |      |      |      |      |          |                     |                          |          |          |                |                |
+// ######|       |           |            |            |      |      |      |      |          |                     |                          |          |          |                |                |
+         <       3, BundleType, BundleType, PassThrough,   256,     1,    32,    32,         5,         S<1, 32,  8>,                S<0, 1, 2>,         2,         1,               4,               1>;
+// clang-format on
+
+#include "run_permute_bundle_example.inc"
+
+int main() { return !run_permute_bundle_example({1, 80, 32000}, {0, 2, 1}); }
--- a/example/39_permute/permute_NxHxW_fp16.cpp
+++ b/example/39_permute/permute_NxHxW_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+using InDataType  = F16;
+using OutDataType = F16;
+
+// clang-format off
+using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl
+// ######| NumDim|     InData|     OutData| Elementwise| Block|  NPer|  HPer|  WPer|   InBlock|      InBlockTransfer|           InBlockTransfer|       Src|       Dst|             Src|             Dst|
+// ######|       |       Type|        Type|   Operation|  Size| Block| Block| Block| LdsExtraW| ThreadClusterLengths| ThreadClusterArrangeOrder| VectorDim| VectorDim| ScalarPerVector| ScalarPerVector|
+// ######|       |           |            |            |      |      |      |      |          |                     |                          |          |          |                |                |
+// ######|       |           |            |            |      |      |      |      |          |                     |                          |          |          |                |                |
+         <      3, InDataType, OutDataType, PassThrough,   128,     4,    16,     8,         6,          S<2, 16, 4>,                S<0, 1, 2>,         2,         1,               2,               1>;
+// clang-format on
+
+#include "run_permute_element_example.inc"
+
+int main() { return !run_permute_element_example({121, 768, 80}, {0, 2, 1}); }
--- a/example/39_permute/run_permute_bundle_example.inc
+++ b/example/39_permute/run_permute_bundle_example.inc
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+bool run_permute_bundle(const Problem& problem)
+{
+    const auto& input_bundle_shape = problem.shape;
+    const auto& input_bundle_axes  = problem.axes;
+
+    const auto output_bundle_shape = transpose(input_bundle_shape, input_bundle_axes);
+
+    Tensor<BundleType> input_bundle_tensor(input_bundle_shape);
+    Tensor<BundleType> output_bundle_tensor(output_bundle_shape);
+
+    // initialize tensor by assigning DataType values
+    ck::utils::FillUniformDistribution<DataType>{-1.f, 1.f}(input_bundle_tensor.AsSpan<DataType>());
+
+    DeviceMem input_device_buf(input_bundle_tensor.GetElementSpaceSizeInBytes());
+    DeviceMem output_device_buf(output_bundle_tensor.GetElementSpaceSizeInBytes());
+
+    using std::data;
+    input_device_buf.ToDevice(data(input_bundle_tensor));
+
+    static_assert(std::is_default_constructible_v<DevicePermuteInstance>);
+
+    auto permute  = DevicePermuteInstance{};
+    auto argument = permute.MakeArgument(to_array(input_bundle_shape),
+                                         to_array(input_bundle_tensor.GetStrides()),
+                                         to_array(output_bundle_shape),
+                                         to_array(output_bundle_tensor.GetStrides()),
+                                         input_device_buf.GetDeviceBuffer(),
+                                         output_device_buf.GetDeviceBuffer(),
+                                         PassThrough{});
+
+    if(!permute.IsSupportedArgument(argument))
+    {
+        std::cerr << "The runtime parameters seems not supported by the device instance, exiting!"
+                  << std::endl;
+        return false;
+    };
+
+    auto invoker   = permute.MakeInvoker();
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, true});
+
+    std::cout << "Perf: " << ave_time << " ms" << std::endl;
+
+    output_device_buf.FromDevice(data(output_bundle_tensor));
+
+    constexpr std::size_t NumElemsInBundle = sizeof(BundleType) / sizeof(DataType);
+
+    // extend tensor shape from [N, H, W] to [N, H, W, NumElemsInBundle]
+    //               axes  from [0, 2, 1] to [0, 2, 1, 3]
+    const auto input_shape = extend_shape(input_bundle_shape, NumElemsInBundle);
+    const auto input_axes  = extend_axes(input_bundle_axes);
+
+    using std::begin;
+
+    Tensor<DataType> input_tensor(input_shape);
+    ranges::copy(input_bundle_tensor.AsSpan<const DataType>(), begin(input_tensor));
+
+    Tensor<DataType> output_tensor(transpose(input_shape, input_axes));
+    if(!host_permute(input_tensor, input_axes, PassThrough{}, output_tensor))
+    {
+        return false;
+    }
+
+    return ck::utils::check_err(output_bundle_tensor.AsSpan<const DataType>(),
+                                output_tensor.AsSpan<const DataType>(),
+                                "Error: incorrect results in output tensor",
+                                1e-6,
+                                1e-6);
+}
+
+bool run_permute_bundle_example(const Problem::Shape& shape, const Problem::Axes& axes)
+{
+    return run_permute_bundle(Problem{shape, axes});
+}
--- a/example/39_permute/run_permute_element_example.inc
+++ b/example/39_permute/run_permute_element_example.inc
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+bool run_permute_element(const Problem& problem)
+{
+    const auto& input_shape = problem.shape;
+    const auto& input_axes  = problem.axes;
+
+    const auto output_shape = transpose(input_shape, input_axes);
+
+    Tensor<InDataType> input_tensor(input_shape);
+    Tensor<OutDataType> output_tensor(output_shape);
+
+    ck::utils::FillUniformDistribution<InDataType>{-1.f, 1.f}(input_tensor);
+
+    DeviceMem input_device_buf(input_tensor.GetElementSpaceSizeInBytes());
+    DeviceMem output_device_buf(output_tensor.GetElementSpaceSizeInBytes());
+
+    using std::data;
+    input_device_buf.ToDevice(data(input_tensor));
+
+    static_assert(std::is_default_constructible_v<DevicePermuteInstance>);
+
+    auto permute  = DevicePermuteInstance{};
+    auto argument = permute.MakeArgument(to_array(input_shape),
+                                         to_array(input_tensor.GetStrides()),
+                                         to_array(output_shape),
+                                         to_array(output_tensor.GetStrides()),
+                                         input_device_buf.GetDeviceBuffer(),
+                                         output_device_buf.GetDeviceBuffer(),
+                                         PassThrough{});
+
+    if(!permute.IsSupportedArgument(argument))
+    {
+        std::cerr << "The runtime parameters seems not supported by the device instance, exiting!"
+                  << std::endl;
+        return false;
+    };
+
+    auto invoker   = permute.MakeInvoker();
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, true});
+
+    std::cout << "Perf: " << ave_time << " ms" << std::endl;
+
+    output_device_buf.FromDevice(data(output_tensor));
+
+    Tensor<OutDataType> output_tensor_host(output_shape);
+    if(!host_permute(input_tensor, input_axes, PassThrough{}, output_tensor_host))
+    {
+        return false;
+    }
+
+    return ck::utils::check_err(output_tensor.AsSpan<const OutDataType>(),
+                                output_tensor_host.AsSpan<const OutDataType>(),
+                                "Error: incorrect results in output tensor",
+                                1e-6,
+                                1e-6);
+}
+
+bool run_permute_element_example(const Problem::Shape& shape, const Problem::Axes& axes)
+{
+    return run_permute_element(Problem{shape, axes});
+}
--- a/example/42_groupnorm/CMakeLists.txt
+++ b/example/42_groupnorm/CMakeLists.txt
+add_example_executable(example_groupnorm_sigmoid_fp16 groupnorm_sigmoid_fp16.cpp)
--- a/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp
+++ b/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <getopt.h>
+
+#include "ck/ck.hpp"
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+
+#include "ck/library/utility/fill.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_common_util.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp"
+
+constexpr int Rank         = 5;
+constexpr int NumReduceDim = 3;
+
+using XDataType     = ck::half_t;
+using GammaDataType = ck::half_t;
+using BetaDataType  = ck::half_t;
+using YDataType     = ck::half_t;
+using AccDataType   = float;
+
+struct YElementOp
+{
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
+    {
+        static_assert(ck::is_same<T, float>::value || ck::is_same<T, double>::value ||
+                          ck::is_same<T, ck::half_t>::value,
+                      "Data type is not supported by this operation!");
+
+        T a;
+
+        ck::tensor_operation::element_wise::Sigmoid{}(a, x);
+
+        y = x * a;
+    };
+};
+
+using DeviceInstance =
+    ck::tensor_operation::device::DeviceLayernormImpl<XDataType,
+                                                      GammaDataType,
+                                                      BetaDataType,
+                                                      AccDataType,
+                                                      YDataType,
+                                                      YElementOp,
+                                                      Rank,
+                                                      NumReduceDim,
+                                                      256, // BlockSize
+                                                      8,   // ClusterM
+                                                      32,  // ClusterK
+                                                      1,   // SliceM
+                                                      8,   // SliceK
+                                                      1,   // SrcVecDim (0=M, 1=K)
+                                                      8,   // SrcScalarPerVector
+                                                      1,   // GammaVecDim (0=M, 1=K)
+                                                      8,   // GammaScalarPerVector
+                                                      1,   // BetaVecDim (0=M, 1=K)
+                                                      8,   // BetaScalarPerVector
+                                                      8>;  // OutScalarPerVector
+
+int main(int argc, char* argv[])
+{
+    ck::index_t N = 128;
+    ck::index_t H = 16;
+    ck::index_t W = 16;
+    ck::index_t G = 32;
+    ck::index_t C = 40;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 6)
+    {
+        N = std::stoi(argv[1]);
+        H = std::stoi(argv[2]);
+        W = std::stoi(argv[3]);
+        G = std::stoi(argv[4]);
+        C = std::stoi(argv[5]);
+    }
+    else
+    {
+        std::cerr << "arg1 to 5: N, H, W, G, C" << std::endl;
+
+        return 1;
+    }
+
+    Tensor<XDataType> x({N, H, W, G, C});
+    Tensor<YDataType> y({N, H, W, G, C});
+    Tensor<GammaDataType> gamma({G, C});
+    Tensor<BetaDataType> beta({G, C});
+
+    ck::utils::FillUniformDistribution<XDataType>{0.f, 1.f}(x.begin(), x.end());
+    ck::utils::FillUniformDistribution<GammaDataType>{0.f, 1.f}(gamma.begin(), gamma.end());
+    ck::utils::FillUniformDistribution<BetaDataType>{0.f, 1.f}(beta.begin(), beta.end());
+
+    DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
+    DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
+    DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
+    DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
+
+    x_dev.ToDevice(x.mData.data());
+    gamma_dev.ToDevice(gamma.mData.data());
+    beta_dev.ToDevice(beta.mData.data());
+
+    const auto y_element_op = YElementOp{};
+
+    auto device_instance = DeviceInstance{};
+    auto argument_ptr    = device_instance.MakeArgumentPointer(
+        {N, H, W, G, C},
+        std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()},
+        {0, 0, 0, C, 1},
+        {0, 0, 0, C, 1},
+        std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
+        {1, 2, 4}, // reduction dimension: [H, W, C]
+        1e-6,
+        x_dev.GetDeviceBuffer(),
+        gamma_dev.GetDeviceBuffer(),
+        beta_dev.GetDeviceBuffer(),
+        y_dev.GetDeviceBuffer(),
+        y_element_op);
+
+    if(!device_instance.IsSupportedArgument(argument_ptr.get()))
+    {
+        std::cout << "The runtime parameters are not supported" << std::endl;
+        return 1;
+    };
+
+    auto invoker_ptr = device_instance.MakeInvokerPointer();
+    float ave_time   = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true, true});
+
+    std::size_t num_btype = sizeof(XDataType) * N * H * W * G * C +
+                            sizeof(YDataType) * N * H * W * G * C + sizeof(GammaDataType) * G * C +
+                            sizeof(BetaDataType) * G * C;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s, "
+              << device_instance.GetTypeString() << std::endl;
+
+    bool pass = true;
+    {
+        Tensor<YDataType> host_y({N, H, W, G, C});
+        using ReferenceInstance = ck::tensor_operation::host::ReferenceGroupnorm<XDataType,
+                                                                                 GammaDataType,
+                                                                                 BetaDataType,
+                                                                                 YDataType,
+                                                                                 AccDataType,
+                                                                                 YElementOp>;
+
+        ReferenceInstance ref;
+        auto ref_argument =
+            ref.MakeArgument(x, gamma, beta, host_y, y_element_op, {N, H, W, G, C}, 1e-6);
+        auto ref_invoker = ref.MakeInvoker();
+        ref_invoker.Run(ref_argument);
+
+        y_dev.FromDevice(y.mData.data());
+        pass &= ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3);
+    }
+
+    return (pass ? 0 : 1);
+}
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -21,35 +21,10 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
    add_dependencies(examples ${EXAMPLE_NAME})
 endfunction(add_example_executable_no_testing EXAMPLE_NAME)

-add_subdirectory(01_gemm)
-add_subdirectory(02_gemm_bilinear)
-add_subdirectory(03_gemm_bias_relu)
-add_subdirectory(04_gemm_add_add_fastgelu)
-add_subdirectory(09_convnd_fwd)
-add_subdirectory(10_convnd_fwd_multiple_d_multiple_reduce)
-add_subdirectory(12_reduce)
-add_subdirectory(13_pool2d_fwd)
-add_subdirectory(14_gemm_xdl_requant_relu_requant)
-add_subdirectory(15_grouped_gemm)
-add_subdirectory(16_gemm_multi_d_multi_reduces)
-add_subdirectory(17_convnd_bwd_data)
-add_subdirectory(18_batched_gemm_reduce)
-add_subdirectory(19_binary_elementwise)
-add_subdirectory(20_convnd_bwd_weight)
-add_subdirectory(21_gemm_layernorm)
-add_subdirectory(22_cgemm)
-add_subdirectory(23_softmax)
-add_subdirectory(24_batched_gemm)
-add_subdirectory(25_gemm_bias_e_permute)
-add_subdirectory(26_contraction)
-add_subdirectory(27_layernorm)
-add_subdirectory(28_grouped_gemm_bias_e_permute)
-add_subdirectory(29_batched_gemm_bias_e_permute)
-add_subdirectory(30_grouped_convnd_fwd_bias_relu_add)
-add_subdirectory(31_batched_gemm_gemm)
-add_subdirectory(32_batched_gemm_scale_softmax_gemm)
-add_subdirectory(33_multiple_reduce)
-add_subdirectory(34_batchnorm)
-add_subdirectory(35_splitK_gemm)
-add_subdirectory(36_sparse_embedding)
-add_subdirectory(41_grouped_conv_conv_fwd)
+# add all example subdir
+file(GLOB dir_list LIST_DIRECTORIES true *)
+FOREACH(subdir ${dir_list})
+    IF(IS_DIRECTORY "${subdir}")
+        add_subdirectory(${subdir})
+    ENDIF()
+ENDFOREACH()
--- a/include/ck/stream_config.hpp
+++ b/include/ck/stream_config.hpp
@@ -10,4 +10,5 @@ struct StreamConfig
 {
    hipStream_t stream_id_ = nullptr;
    bool time_kernel_      = false;
+    int log_level_         = 0;
 };
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
@@ -649,6 +649,9 @@ struct BlockwiseGemmXdlops_v2
    static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL);
    static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL);

+    static_assert(KPerThread % KPack == 0,
+                  "Wrong KPack setting; try increasing KPerThread or decreasing KPack");
+
    StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
                              FloatAcc,
                              MRepeat * NRepeat,