Merge pull request #3 from ROCmSoftwarePlatform/downstream-internal

Downstream from public to internal

Merge pull request #3 from ROCmSoftwarePlatform/downstream-internal
Downstream from public to internal
52f64967 · Illia Silin · GitHub · dda3a0a1 · 246ceee4 · 52f64967
Unverified Commit 52f64967 authored Feb 23, 2023 by Illia Silin Committed by GitHub Feb 23, 2023
20 changed files
--- a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp
+++ b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp
@@ -15,6 +15,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/numeric.hpp"

 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

@@ -246,21 +247,11 @@ int main(int argc, char* argv[])
        exit(0);
    }

-    Tensor<ADataType> a_gs_ms_ks(
-        std::vector<std::size_t>(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.end()),
-        std::vector<std::size_t>(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end()));
-    Tensor<BDataType> b_gs_ns_ks(
-        std::vector<std::size_t>(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.end()),
-        std::vector<std::size_t>(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end()));
-    Tensor<DDataType> d_gs_ms_ns(
-        std::vector<std::size_t>(d_gs_ms_ns_lengths.begin(), d_gs_ms_ns_lengths.end()),
-        std::vector<std::size_t>(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end()));
-    Tensor<EDataType> e_gs_ms_ns_host_result(
-        std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
-    Tensor<EDataType> e_gs_ms_ns_device_result(
-        std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
+    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
+    Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides);
+    Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides);
+    Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
+    Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);

    std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
    std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
@@ -327,20 +318,14 @@ int main(int argc, char* argv[])

    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

-    std::size_t M = std::accumulate(e_gs_ms_ns_lengths.begin() + NumDimG,
-                                    e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM,
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
+    std::size_t M = ck::accumulate_n<ck::index_t>(
+        e_gs_ms_ns_lengths.begin() + NumDimG, NumDimM, 1, std::multiplies<>{});

-    std::size_t N = std::accumulate(e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM,
-                                    e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM + NumDimN,
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
+    std::size_t N = ck::accumulate_n<ck::index_t>(
+        e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM, NumDimN, 1, std::multiplies<>{});

-    std::size_t K = std::accumulate(a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM,
-                                    a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM + NumDimK,
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
+    std::size_t K = ck::accumulate_n<ck::index_t>(
+        a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM, NumDimK, 1, std::multiplies<>{});

    std::size_t flop      = std::size_t(2) * M * N * K;
    std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
@@ -357,9 +342,7 @@ int main(int argc, char* argv[])

    if(do_verification)
    {
-        Tensor<CShuffleDataType> c_gs_ms_ns_host_result(
-            std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
-            std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
+        Tensor<CShuffleDataType> c_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);

        using ReferenceOpInstance = ReferenceContraction_G1_M2_N3_K1<NumDimM,
                                                                     NumDimN,
@@ -407,9 +390,7 @@ int main(int argc, char* argv[])
            }
        }

-        return ck::utils::check_err(e_gs_ms_ns_device_result.mData, e_gs_ms_ns_host_result.mData)
-                   ? 0
-                   : 1;
+        return ck::utils::check_err(e_gs_ms_ns_device_result, e_gs_ms_ns_host_result) ? 0 : 1;
    }

    return 0;

--- a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp
+++ b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp
@@ -15,6 +15,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/numeric.hpp"

 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -246,21 +247,11 @@ int main(int argc, char* argv[])
        exit(0);
    }

-    Tensor<ADataType> a_gs_ms_ks(
-        std::vector<std::size_t>(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.end()),
-        std::vector<std::size_t>(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end()));
-    Tensor<BDataType> b_gs_ns_ks(
-        std::vector<std::size_t>(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.end()),
-        std::vector<std::size_t>(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end()));
-    Tensor<DDataType> d_gs_ms_ns(
-        std::vector<std::size_t>(d_gs_ms_ns_lengths.begin(), d_gs_ms_ns_lengths.end()),
-        std::vector<std::size_t>(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end()));
-    Tensor<EDataType> e_gs_ms_ns_host_result(
-        std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
-    Tensor<EDataType> e_gs_ms_ns_device_result(
-        std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
+    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
+    Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides);
+    Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides);
+    Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
+    Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);

    std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
    std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
@@ -327,20 +318,14 @@ int main(int argc, char* argv[])

    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

-    ck::index_t M = std::accumulate(e_gs_ms_ns_lengths.begin(),
-                                    e_gs_ms_ns_lengths.begin() + NumDimM,
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
+    ck::index_t M =
+        ck::accumulate_n<ck::index_t>(e_gs_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});

-    ck::index_t N = std::accumulate(e_gs_ms_ns_lengths.begin() + NumDimM,
-                                    e_gs_ms_ns_lengths.begin() + NumDimM + NumDimN,
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
+    ck::index_t N = ck::accumulate_n<ck::index_t>(
+        e_gs_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});

-    ck::index_t K = std::accumulate(a_gs_ms_ks_lengths.begin() + NumDimM,
-                                    a_gs_ms_ks_lengths.begin() + NumDimM + NumDimK,
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
+    ck::index_t K = ck::accumulate_n<ck::index_t>(
+        a_gs_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});

    std::size_t flop      = std::size_t(2) * M * N * K;
    std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
@@ -357,9 +342,7 @@ int main(int argc, char* argv[])

    if(do_verification)
    {
-        Tensor<CShuffleDataType> c_gs_ms_ns_host_result(
-            std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
-            std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
+        Tensor<CShuffleDataType> c_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);

        using ReferenceOpInstance = ReferenceContraction_G1_M3_N2_K1<NumDimG,
                                                                     NumDimM,
@@ -408,9 +391,7 @@ int main(int argc, char* argv[])
            }
        }

-        return ck::utils::check_err(e_gs_ms_ns_device_result.mData, e_gs_ms_ns_host_result.mData)
-                   ? 0
-                   : 1;
+        return ck::utils::check_err(e_gs_ms_ns_device_result, e_gs_ms_ns_host_result) ? 0 : 1;
    }

    return 0;

--- a/example/26_contraction/CMakeLists.txt
+++ b/example/26_contraction/CMakeLists.txt
 add_example_executable(example_contraction_bilinear_xdl_fp32 contraction_bilinear_xdl_fp32.cpp)
 add_example_executable(example_contraction_scale_xdl_fp32 contraction_scale_xdl_fp32.cpp)
+
+add_example_executable(example_contraction_bilinear_xdl_fp64 contraction_bilinear_xdl_fp64.cpp)
+add_example_executable(example_contraction_scale_xdl_fp64 contraction_scale_xdl_fp64.cpp)
--- a/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
@@ -15,6 +15,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/numeric.hpp"

 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -288,21 +289,11 @@ int main(int argc, char* argv[])
        exit(0);
    }

-    Tensor<ADataType> a_ms_ks(
-        std::vector<std::size_t>(a_ms_ks_lengths.begin(), a_ms_ks_lengths.end()),
-        std::vector<std::size_t>(a_ms_ks_strides.begin(), a_ms_ks_strides.end()));
-    Tensor<BDataType> b_ns_ks(
-        std::vector<std::size_t>(b_ns_ks_lengths.begin(), b_ns_ks_lengths.end()),
-        std::vector<std::size_t>(b_ns_ks_strides.begin(), b_ns_ks_strides.end()));
-    Tensor<EDataType> d_ms_ns(
-        std::vector<std::size_t>(d_ms_ns_lengths.begin(), d_ms_ns_lengths.end()),
-        std::vector<std::size_t>(d_ms_ns_strides.begin(), d_ms_ns_strides.end()));
-    Tensor<EDataType> e_ms_ns_host_result(
-        std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
-    Tensor<EDataType> e_ms_ns_device_result(
-        std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
+    Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides);
+    Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides);
+    Tensor<EDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides);
+    Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
+    Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);

    std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl;
    std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl;
@@ -368,20 +359,14 @@ int main(int argc, char* argv[])

    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

-    ck::index_t M = std::accumulate(e_ms_ns_lengths.begin(),
-                                    e_ms_ns_lengths.begin() + NumDimM,
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
+    ck::index_t M =
+        ck::accumulate_n<ck::index_t>(e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});

-    ck::index_t N = std::accumulate(e_ms_ns_lengths.begin() + NumDimM,
-                                    e_ms_ns_lengths.begin() + NumDimM + NumDimN,
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
+    ck::index_t N = ck::accumulate_n<ck::index_t>(
+        e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});

-    ck::index_t K = std::accumulate(a_ms_ks_lengths.begin() + NumDimM,
-                                    a_ms_ks_lengths.begin() + NumDimM + NumDimK,
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
+    ck::index_t K = ck::accumulate_n<ck::index_t>(
+        a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});

    std::size_t flop      = std::size_t(2) * M * N * K;
    std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
@@ -398,9 +383,7 @@ int main(int argc, char* argv[])

    if(do_verification)
    {
-        Tensor<CShuffleDataType> c_ms_ns_host_result(
-            std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-            std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
+        Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);

        using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM,
                                                                  NumDimN,
@@ -437,7 +420,7 @@ int main(int argc, char* argv[])
            }
        }

-        return ck::utils::check_err(e_ms_ns_device_result.mData, e_ms_ns_host_result.mData) ? 0 : 1;
+        return ck::utils::check_err(e_ms_ns_device_result, e_ms_ns_host_result) ? 0 : 1;
    }

    return 0;

--- a/example/26_contraction/contraction_bilinear_xdl_fp64.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp64.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/numeric.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F64 = double;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = F64;
+using BDataType        = F64;
+using AccDataType      = F64;
+using CShuffleDataType = F64;
+using DDataType        = F64;
+using DsDataType       = ck::Tuple<DDataType>;
+using EDataType        = F64;
+
+static constexpr ck::index_t NumDimM = 2;
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 2;
+
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+// clang-format off
+using DeviceOpInstanceKKNN = ck::tensor_operation::device::
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|     DsData| EData|            A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |  Type|  Type|    Type| DataType|       Type|  Type|  Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |      |      |        |         |           |      |    Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |      |      |        |         |           |      |             |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK,   F64,   F64,     F64,      F64, DsDataType,   F64,   AElementOp,  BElementOp, CDEElementOp,       GemmSpec,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              2,              2,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              2,              2,         1,           1,           1,              S<1, 16, 1, 16>,               1>;
+
+using DeviceOpInstanceKNNN = ck::tensor_operation::device::
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|     DsData| EData|            A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |  Type|  Type|    Type| DataType|       Type|  Type|  Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |      |      |        |         |           |      |    Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |      |      |        |         |           |      |             |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK,   F64,   F64,     F64,      F64, DsDataType,   F64,   AElementOp,  BElementOp, CDEElementOp,       GemmSpec,        1,   256,   128,   128,    16,   2,   1,   16,   16,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              2,              2,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>;
+
+using DeviceOpInstanceMKNN = ck::tensor_operation::device::
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|     DsData| EData|            A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |  Type|  Type|    Type| DataType|       Type|  Type|  Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |      |      |        |         |           |      |    Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |      |      |        |         |           |      |             |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK,   F64,   F64,     F64,      F64, DsDataType,   F64,   AElementOp,  BElementOp, CDEElementOp,       GemmSpec,        1,   256,   128,   128,    16,   1,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              2,              2,         1,           1,           1,              S<1, 16, 1, 16>,               1>;
+
+using DeviceOpInstanceMNNN = ck::tensor_operation::device::
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|     DsData| EData|            A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |  Type|  Type|    Type| DataType|       Type|  Type|  Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |      |      |        |         |           |      |    Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |      |      |        |         |           |      |             |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK,   F64,   F64,     F64,      F64, DsDataType,   F64,   AElementOp,  BElementOp, CDEElementOp,       GemmSpec,        1,   256,   128,   128,    16,   1,   1,   16,   16,    4,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>;
+// clang-format on
+
+using DeviceOpInstance = DeviceOpInstanceKKNN;
+
+// hardcoded for NumDimM == NumDimN == NumDimK == 2
+template <ck::index_t NumDimM,
+          ck::index_t NumDimN,
+          ck::index_t NumDimK,
+          typename ADataType,
+          typename BDataType,
+          typename EDataType,
+          typename AccDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation,
+          ck::enable_if_t<NumDimM == 2 && NumDimN == 2 && NumDimK == 2, bool> = false>
+struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::BaseOperator
+{
+    // Argument
+    struct Argument : public ck::tensor_operation::device::BaseArgument
+    {
+        Argument(const Tensor<ADataType>& a_ms_ks,
+                 const Tensor<BDataType>& b_ns_ks,
+                 Tensor<EDataType>& e_ms_ns,
+                 AElementwiseOperation a_element_op,
+                 BElementwiseOperation b_element_op,
+                 CDEElementwiseOperation cde_element_op)
+            : a_ms_ks_{a_ms_ks},
+              b_ns_ks_{b_ns_ks},
+              e_ms_ns_{e_ms_ns},
+              a_element_op_{a_element_op},
+              b_element_op_{b_element_op},
+              cde_element_op_{cde_element_op}
+        {
+        }
+
+        const Tensor<ADataType>& a_ms_ks_;
+        const Tensor<BDataType>& b_ns_ks_;
+        Tensor<EDataType>& e_ms_ns_;
+
+        AElementwiseOperation a_element_op_;
+        BElementwiseOperation b_element_op_;
+        CDEElementwiseOperation cde_element_op_;
+    };
+
+    // Invoker
+    struct Invoker : public ck::tensor_operation::device::BaseInvoker
+    {
+        using Argument = ReferenceContraction_M2_N2_K2::Argument;
+
+        float Run(const Argument& arg)
+        {
+            auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) {
+                const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[2];
+                const int K1 = arg.a_ms_ks_.mDesc.GetLengths()[3];
+
+                AccDataType v_acc = 0;
+
+                for(int k0 = 0; k0 < K0; ++k0)
+                {
+                    for(int k1 = 0; k1 < K1; ++k1)
+                    {
+                        AccDataType v_a;
+                        AccDataType v_b;
+
+                        arg.a_element_op_(
+                            v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0, k1)));
+                        arg.b_element_op_(
+                            v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0, k1)));
+
+                        v_acc += v_a * v_b;
+                    }
+                }
+
+                AccDataType v_c;
+
+                arg.cde_element_op_(v_c, v_acc);
+
+                arg.e_ms_ns_(m0, m1, n0, n1) = v_c;
+            };
+
+            make_ParallelTensorFunctor(f_ms_ns,
+                                       arg.e_ms_ns_.mDesc.GetLengths()[0],
+                                       arg.e_ms_ns_.mDesc.GetLengths()[1],
+                                       arg.e_ms_ns_.mDesc.GetLengths()[2],
+                                       arg.e_ms_ns_.mDesc.GetLengths()[3])(
+                std::thread::hardware_concurrency());
+
+            return 0;
+        }
+
+        float Run(const ck::tensor_operation::device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg));
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override
+    {
+        return true;
+    }
+
+    static auto MakeArgument(const Tensor<ADataType>& a_ms_ks,
+                             const Tensor<BDataType>& b_ns_ks,
+                             Tensor<EDataType>& e_ms_ns,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CDEElementwiseOperation cde_element_op)
+    {
+        return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    virtual std::unique_ptr<ck::tensor_operation::device::BaseInvoker> MakeInvokerPointer()
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "ReferenceContraction_M2_N2_K2"
+            << std::endl;
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+
+    // A[M0, M1, K0, K1]
+    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
+    // B[N0, N1, K0, K1]
+    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
+    std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
+    // D[M0, M1, N0, N1]
+    std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
+    // E[M0, M1, N0, N1]
+    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
+
+    float alpha = 1.f;
+    float beta  = 1.f;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 28)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        const ck::index_t M0 = std::stoi(argv[4]);
+        const ck::index_t M1 = std::stoi(argv[5]);
+
+        const ck::index_t N0 = std::stoi(argv[6]);
+        const ck::index_t N1 = std::stoi(argv[7]);
+
+        const ck::index_t K0 = std::stoi(argv[8]);
+        const ck::index_t K1 = std::stoi(argv[9]);
+
+        a_ms_ks_lengths = {M0, M1, K0, K1};
+        a_ms_ks_strides = {
+            std::stoi(argv[10]), std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13])};
+
+        b_ns_ks_lengths = {N0, N1, K0, K1};
+        b_ns_ks_strides = {
+            std::stoi(argv[14]), std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17])};
+
+        d_ms_ns_lengths = {M0, M1, N0, N1};
+        d_ms_ns_strides = {
+            std::stoi(argv[18]), std::stoi(argv[19]), std::stoi(argv[20]), std::stoi(argv[21])};
+
+        e_ms_ns_lengths = {M0, M1, N0, N1};
+        e_ms_ns_strides = {
+            std::stoi(argv[22]), std::stoi(argv[23]), std::stoi(argv[24]), std::stoi(argv[25])};
+
+        alpha = std::stof(argv[26]);
+        beta  = std::stof(argv[27]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 7: M0, M1, N0, N1, K0, K1\n");
+        printf("arg10 to 13: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
+        printf("arg14 to 17: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
+        printf("arg18 to 21: Stride_D_M0, Stride_D_M1, Stride_D_N0, Stride_D_N1\n");
+        printf("arg22 to 25: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
+        printf("arg26 to 27: alpha, beta\n");
+        exit(0);
+    }
+
+    Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides);
+    Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides);
+    Tensor<EDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides);
+    Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
+    Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
+
+    std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl;
+    std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl;
+    std::cout << "d_ms_ns: " << d_ms_ns.mDesc << std::endl;
+    std::cout << "e_ms_ns: " << e_ms_ns_host_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_ns_ks.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        d_ms_ns.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        break;
+    default:
+        a_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_ns_ks.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        d_ms_ns.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        break;
+    }
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a_ms_ks.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_ns_ks.mDesc.GetElementSpaceSize());
+    DeviceMem d_device_buf(sizeof(DDataType) * d_ms_ns.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_ms_ns_device_result.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a_ms_ks.mData.data());
+    b_device_buf.ToDevice(b_ns_ks.mData.data());
+    d_device_buf.ToDevice(d_ms_ns.mData.data());
+
+    // set zero
+    e_device_buf.SetZero();
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{alpha, beta};
+
+    // device operation
+    auto op       = DeviceOpInstance{};
+    auto invoker  = op.MakeInvoker();
+    auto argument = op.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                                    b_device_buf.GetDeviceBuffer(),
+                                    std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
+                                    e_device_buf.GetDeviceBuffer(),
+                                    a_ms_ks_lengths,
+                                    a_ms_ks_strides,
+                                    b_ns_ks_lengths,
+                                    b_ns_ks_strides,
+                                    std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
+                                    std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
+                                    e_ms_ns_lengths,
+                                    e_ms_ns_strides,
+                                    a_element_op,
+                                    b_element_op,
+                                    cde_element_op);
+
+    if(!op.IsSupportedArgument(argument))
+    {
+        std::cout << op.GetTypeString() << " does not support this problem" << std::endl;
+
+        return 0;
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    ck::index_t M =
+        ck::accumulate_n<ck::index_t>(e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
+
+    ck::index_t N = ck::accumulate_n<ck::index_t>(
+        e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
+
+    ck::index_t K = ck::accumulate_n<ck::index_t>(
+        a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
+
+    std::size_t flop      = std::size_t(2) * M * N * K;
+    std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                            sizeof(DDataType) * M * N + sizeof(EDataType) * M * N;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << op.GetTypeString() << std::endl;
+
+    e_device_buf.FromDevice(e_ms_ns_device_result.mData.data());
+
+    if(do_verification)
+    {
+        Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
+
+        using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM,
+                                                                  NumDimN,
+                                                                  NumDimK,
+                                                                  ADataType,
+                                                                  BDataType,
+                                                                  CShuffleDataType,
+                                                                  AccDataType,
+                                                                  AElementOp,
+                                                                  BElementOp,
+                                                                  PassThrough>;
+
+        auto ref_gemm    = ReferenceOpInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        for(size_t m0 = 0; m0 < e_ms_ns_host_result.mDesc.GetLengths()[0]; ++m0)
+        {
+            for(size_t m1 = 0; m1 < e_ms_ns_host_result.mDesc.GetLengths()[1]; ++m1)
+            {
+                for(size_t n0 = 0; n0 < e_ms_ns_host_result.mDesc.GetLengths()[2]; ++n0)
+                {
+                    for(size_t n1 = 0; n1 < e_ms_ns_host_result.mDesc.GetLengths()[3]; ++n1)
+                    {
+                        cde_element_op(e_ms_ns_host_result(m0, m1, n0, n1),
+                                       c_ms_ns_host_result(m0, m1, n0, n1),
+                                       d_ms_ns(m0, m1, n0, n1));
+                    }
+                }
+            }
+        }
+
+        return ck::utils::check_err(e_ms_ns_device_result, e_ms_ns_host_result) ? 0 : 1;
+    }
+
+    return 0;
+}
--- a/example/26_contraction/contraction_scale_xdl_fp32.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp32.cpp
@@ -15,6 +15,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/numeric.hpp"

 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -277,18 +278,10 @@ int main(int argc, char* argv[])
        exit(0);
    }

-    Tensor<ADataType> a_ms_ks(
-        std::vector<std::size_t>(a_ms_ks_lengths.begin(), a_ms_ks_lengths.end()),
-        std::vector<std::size_t>(a_ms_ks_strides.begin(), a_ms_ks_strides.end()));
-    Tensor<BDataType> b_ns_ks(
-        std::vector<std::size_t>(b_ns_ks_lengths.begin(), b_ns_ks_lengths.end()),
-        std::vector<std::size_t>(b_ns_ks_strides.begin(), b_ns_ks_strides.end()));
-    Tensor<EDataType> e_ms_ns_host_result(
-        std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
-    Tensor<EDataType> e_ms_ns_device_result(
-        std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
+    Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides);
+    Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides);
+    Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
+    Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);

    std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl;
    std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl;
@@ -349,20 +342,14 @@ int main(int argc, char* argv[])

    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

-    ck::index_t M = std::accumulate(e_ms_ns_lengths.begin(),
-                                    e_ms_ns_lengths.begin() + NumDimM,
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
+    ck::index_t M =
+        ck::accumulate_n<ck::index_t>(e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});

-    ck::index_t N = std::accumulate(e_ms_ns_lengths.begin() + NumDimM,
-                                    e_ms_ns_lengths.begin() + NumDimM + NumDimN,
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
+    ck::index_t N = ck::accumulate_n<ck::index_t>(
+        e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});

-    ck::index_t K = std::accumulate(a_ms_ks_lengths.begin() + NumDimM,
-                                    a_ms_ks_lengths.begin() + NumDimM + NumDimK,
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
+    ck::index_t K = ck::accumulate_n<ck::index_t>(
+        a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});

    std::size_t flop = std::size_t(2) * M * N * K;
    std::size_t num_btype =
@@ -379,9 +366,7 @@ int main(int argc, char* argv[])

    if(do_verification)
    {
-        Tensor<CShuffleDataType> c_ms_ns_host_result(
-            std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-            std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
+        Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);

        using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM,
                                                                  NumDimN,
@@ -417,7 +402,7 @@ int main(int argc, char* argv[])
            }
        }

-        return ck::utils::check_err(e_ms_ns_device_result.mData, e_ms_ns_host_result.mData) ? 0 : 1;
+        return ck::utils::check_err(e_ms_ns_device_result, e_ms_ns_host_result) ? 0 : 1;
    }

    return 0;

--- a/example/26_contraction/contraction_scale_xdl_fp64.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp64.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/numeric.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F64 = double;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = F64;
+using BDataType        = F64;
+using AccDataType      = F64;
+using CShuffleDataType = F64;
+using DsDataType       = ck::Tuple<>;
+using EDataType        = F64;
+
+static constexpr ck::index_t NumDimM = 2;
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 2;
+
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using CDEElementOp = ck::tensor_operation::element_wise::Scale;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+// clang-format off
+using DeviceOpInstanceKKN = ck::tensor_operation::device::
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|     DsData| EData|            A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |  Type|  Type|    Type| DataType|       Type|  Type|  Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |      |      |        |         |           |      |    Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |      |      |        |         |           |      |             |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK,   F64,   F64,     F64,      F64, DsDataType,   F64,   AElementOp,  BElementOp, CDEElementOp,       GemmSpec,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              2,              2,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              2,              2,         1,           1,           1,              S<1, 16, 1, 16>,               1>;
+
+using DeviceOpInstanceKNN = ck::tensor_operation::device::
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|     DsData| EData|            A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |  Type|  Type|    Type| DataType|       Type|  Type|  Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |      |      |        |         |           |      |    Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |      |      |        |         |           |      |             |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK,   F64,   F64,     F64,      F64, DsDataType,   F64,   AElementOp,  BElementOp, CDEElementOp,       GemmSpec,        1,   256,   128,   128,    16,   2,   1,   16,   16,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              2,              2,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>;
+
+using DeviceOpInstanceMKN = ck::tensor_operation::device::
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|     DsData| EData|            A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |  Type|  Type|    Type| DataType|       Type|  Type|  Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |      |      |        |         |           |      |    Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |      |      |        |         |           |      |             |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK,   F64,   F64,     F64,      F64, DsDataType,   F64,   AElementOp,  BElementOp, CDEElementOp,       GemmSpec,        1,   256,   128,   128,    16,   1,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              2,              2,         1,           1,           1,              S<1, 16, 1, 16>,               1>;
+
+using DeviceOpInstanceMNN = ck::tensor_operation::device::
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|     DsData| EData|            A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |  Type|  Type|    Type| DataType|       Type|  Type|  Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |      |      |        |         |           |      |    Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |      |      |        |         |           |      |             |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK,   F64,   F64,     F64,      F64, DsDataType,   F64,   AElementOp,  BElementOp, CDEElementOp,       GemmSpec,        1,   256,   128,   128,    16,   1,   1,   16,   16,    4,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>;
+// clang-format on
+
+using DeviceOpInstance = DeviceOpInstanceKKN;
+
+// hardcoded for NumDimM == NumDimN == NumDimK == 2
+template <ck::index_t NumDimM,
+          ck::index_t NumDimN,
+          ck::index_t NumDimK,
+          typename ADataType,
+          typename BDataType,
+          typename EDataType,
+          typename AccDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation,
+          ck::enable_if_t<NumDimM == 2 && NumDimN == 2 && NumDimK == 2, bool> = false>
+struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::BaseOperator
+{
+    // Argument
+    struct Argument : public ck::tensor_operation::device::BaseArgument
+    {
+        Argument(const Tensor<ADataType>& a_ms_ks,
+                 const Tensor<BDataType>& b_ns_ks,
+                 Tensor<EDataType>& e_ms_ns,
+                 AElementwiseOperation a_element_op,
+                 BElementwiseOperation b_element_op,
+                 CDEElementwiseOperation cde_element_op)
+            : a_ms_ks_{a_ms_ks},
+              b_ns_ks_{b_ns_ks},
+              e_ms_ns_{e_ms_ns},
+              a_element_op_{a_element_op},
+              b_element_op_{b_element_op},
+              cde_element_op_{cde_element_op}
+        {
+        }
+
+        const Tensor<ADataType>& a_ms_ks_;
+        const Tensor<BDataType>& b_ns_ks_;
+        Tensor<EDataType>& e_ms_ns_;
+
+        AElementwiseOperation a_element_op_;
+        BElementwiseOperation b_element_op_;
+        CDEElementwiseOperation cde_element_op_;
+    };
+
+    // Invoker
+    struct Invoker : public ck::tensor_operation::device::BaseInvoker
+    {
+        using Argument = ReferenceContraction_M2_N2_K2::Argument;
+
+        float Run(const Argument& arg)
+        {
+            auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) {
+                const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[2];
+                const int K1 = arg.a_ms_ks_.mDesc.GetLengths()[3];
+
+                AccDataType v_acc = 0;
+
+                for(int k0 = 0; k0 < K0; ++k0)
+                {
+                    for(int k1 = 0; k1 < K1; ++k1)
+                    {
+                        AccDataType v_a;
+                        AccDataType v_b;
+
+                        arg.a_element_op_(
+                            v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0, k1)));
+                        arg.b_element_op_(
+                            v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0, k1)));
+
+                        v_acc += v_a * v_b;
+                    }
+                }
+
+                AccDataType v_c;
+
+                arg.cde_element_op_(v_c, v_acc);
+
+                arg.e_ms_ns_(m0, m1, n0, n1) = v_c;
+            };
+
+            make_ParallelTensorFunctor(f_ms_ns,
+                                       arg.e_ms_ns_.mDesc.GetLengths()[0],
+                                       arg.e_ms_ns_.mDesc.GetLengths()[1],
+                                       arg.e_ms_ns_.mDesc.GetLengths()[2],
+                                       arg.e_ms_ns_.mDesc.GetLengths()[3])(
+                std::thread::hardware_concurrency());
+
+            return 0;
+        }
+
+        float Run(const ck::tensor_operation::device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg));
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override
+    {
+        return true;
+    }
+
+    static auto MakeArgument(const Tensor<ADataType>& a_ms_ks,
+                             const Tensor<BDataType>& b_ns_ks,
+                             Tensor<EDataType>& e_ms_ns,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CDEElementwiseOperation cde_element_op)
+    {
+        return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    virtual std::unique_ptr<ck::tensor_operation::device::BaseInvoker> MakeInvokerPointer()
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "ReferenceContraction_M2_N2_K2"
+            << std::endl;
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+
+    // A[M0, M1, K0, K1]
+    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
+    // B[N0, N1, K0, K1]
+    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
+    std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
+    // E[M0, M1, N0, N1]
+    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
+
+    float scale = 1.f;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 23)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        const ck::index_t M0 = std::stoi(argv[4]);
+        const ck::index_t M1 = std::stoi(argv[5]);
+
+        const ck::index_t N0 = std::stoi(argv[6]);
+        const ck::index_t N1 = std::stoi(argv[7]);
+
+        const ck::index_t K0 = std::stoi(argv[8]);
+        const ck::index_t K1 = std::stoi(argv[9]);
+
+        a_ms_ks_lengths = {M0, M1, K0, K1};
+        a_ms_ks_strides = {
+            std::stoi(argv[10]), std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13])};
+
+        b_ns_ks_lengths = {N0, N1, K0, K1};
+        b_ns_ks_strides = {
+            std::stoi(argv[14]), std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17])};
+
+        e_ms_ns_lengths = {M0, M1, N0, N1};
+        e_ms_ns_strides = {
+            std::stoi(argv[18]), std::stoi(argv[19]), std::stoi(argv[20]), std::stoi(argv[21])};
+
+        scale = std::stof(argv[22]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 9: M0, M1, N0, N1, K0, K1\n");
+        printf("arg10 to 13: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
+        printf("arg14 to 17: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
+        printf("arg18 to 21: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
+        printf("arg22: scale\n");
+        exit(0);
+    }
+
+    Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides);
+    Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides);
+    Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
+    Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
+
+    std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl;
+    std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl;
+    std::cout << "e_ms_ns: " << e_ms_ns_host_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_ns_ks.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        break;
+    default:
+        a_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_ns_ks.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        break;
+    }
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a_ms_ks.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_ns_ks.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_ms_ns_device_result.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a_ms_ks.mData.data());
+    b_device_buf.ToDevice(b_ns_ks.mData.data());
+
+    // set zero
+    e_device_buf.SetZero();
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{scale};
+
+    // device operation
+    auto op       = DeviceOpInstance{};
+    auto invoker  = op.MakeInvoker();
+    auto argument = op.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                                    b_device_buf.GetDeviceBuffer(),
+                                    std::array<const void*, 0>{},
+                                    e_device_buf.GetDeviceBuffer(),
+                                    a_ms_ks_lengths,
+                                    a_ms_ks_strides,
+                                    b_ns_ks_lengths,
+                                    b_ns_ks_strides,
+                                    std::array<std::vector<ck::index_t>, 0>{},
+                                    std::array<std::vector<ck::index_t>, 0>{},
+                                    e_ms_ns_lengths,
+                                    e_ms_ns_strides,
+                                    a_element_op,
+                                    b_element_op,
+                                    cde_element_op);
+
+    if(!op.IsSupportedArgument(argument))
+    {
+        std::cout << op.GetTypeString() << " does not support this problem" << std::endl;
+
+        return 0;
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    ck::index_t M =
+        ck::accumulate_n<ck::index_t>(e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
+
+    ck::index_t N = ck::accumulate_n<ck::index_t>(
+        e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
+
+    ck::index_t K = ck::accumulate_n<ck::index_t>(
+        a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
+
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_btype =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + +sizeof(EDataType) * M * N;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << op.GetTypeString() << std::endl;
+
+    e_device_buf.FromDevice(e_ms_ns_device_result.mData.data());
+
+    if(do_verification)
+    {
+        Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
+
+        using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM,
+                                                                  NumDimN,
+                                                                  NumDimK,
+                                                                  ADataType,
+                                                                  BDataType,
+                                                                  CShuffleDataType,
+                                                                  AccDataType,
+                                                                  AElementOp,
+                                                                  BElementOp,
+                                                                  PassThrough>;
+
+        auto ref_gemm    = ReferenceOpInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        for(size_t m0 = 0; m0 < e_ms_ns_host_result.mDesc.GetLengths()[0]; ++m0)
+        {
+            for(size_t m1 = 0; m1 < e_ms_ns_host_result.mDesc.GetLengths()[1]; ++m1)
+            {
+                for(size_t n0 = 0; n0 < e_ms_ns_host_result.mDesc.GetLengths()[2]; ++n0)
+                {
+                    for(size_t n1 = 0; n1 < e_ms_ns_host_result.mDesc.GetLengths()[3]; ++n1)
+                    {
+                        cde_element_op(e_ms_ns_host_result(m0, m1, n0, n1),
+                                       c_ms_ns_host_result(m0, m1, n0, n1));
+                    }
+                }
+            }
+        }
+
+        return ck::utils::check_err(e_ms_ns_device_result, e_ms_ns_host_result) ? 0 : 1;
+    }
+
+    return 0;
+}
--- a/example/27_layernorm/CMakeLists.txt
+++ b/example/27_layernorm/CMakeLists.txt
-add_example_executable(example_layernorm_blockwise layernorm_blockwise.cpp)
\ No newline at end of file
+add_example_executable(example_layernorm_blockwise layernorm_blockwise.cpp)
--- a/example/27_layernorm/layernorm_blockwise.cpp
+++ b/example/27_layernorm/layernorm_blockwise.cpp
@@ -17,14 +17,15 @@
 #include "ck/library/utility/host_common_util.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"

-using XDataType     = ck::half_t;
-using GammaDataType = ck::half_t;
-using BetaDataType  = ck::half_t;
-using YDataType     = ck::half_t;
-using AccDataType   = float;
-using PassThrough   = ck::tensor_operation::element_wise::PassThrough;
+using XDataType       = ck::half_t;
+using GammaDataType   = ck::half_t;
+using BetaDataType    = ck::half_t;
+using YDataType       = ck::half_t;
+using ComputeDataType = float;
+using PassThrough     = ck::tensor_operation::element_wise::PassThrough;

 constexpr int Rank         = 2;
 constexpr int NumReduceDim = 1;
@@ -33,7 +34,7 @@ using DeviceInstance =
    ck::tensor_operation::device::DeviceNormalizationImpl<XDataType,
                                                          GammaDataType,
                                                          BetaDataType,
-                                                          AccDataType,
+                                                          ComputeDataType,
                                                          YDataType,
                                                          PassThrough,
                                                          Rank,
@@ -60,13 +61,13 @@ int main()
    ck::index_t Stride = N;

    auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
-        return HostTensorDescriptor(std::vector<std::size_t>({len}),
-                                    std::vector<std::size_t>({stride}));
+        return HostTensorDescriptor({len}, {stride});
    };

    auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) {
-        return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                    std::vector<std::size_t>({stride, 1}));
+        using namespace ck::literals;
+
+        return HostTensorDescriptor({row, col}, {stride, 1_uz});
    };

    Tensor<XDataType> x(f_host_tensor_descriptor2d(M, N, Stride));
@@ -100,6 +101,8 @@ int main()
        gamma_dev.GetDeviceBuffer(),
        beta_dev.GetDeviceBuffer(),
        y_dev.GetDeviceBuffer(),
+        nullptr,
+        nullptr,
        PassThrough{});

    if(!device_instance.IsSupportedArgument(argument_ptr.get()))
@@ -118,7 +121,7 @@ int main()
                                                                                 GammaDataType,
                                                                                 BetaDataType,
                                                                                 YDataType,
-                                                                                 AccDataType,
+                                                                                 ComputeDataType,
                                                                                 PassThrough,
                                                                                 Rank,
                                                                                 NumReduceDim>;
@@ -130,8 +133,7 @@ int main()
        ref_invoker.Run(ref_argument);

        y_dev.FromDevice(y.mData.data());
-        pass &=
-            ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results d1", 1e-3, 1e-3);
+        pass &= ck::utils::check_err(y, host_y, "Error: Incorrect results d1", 1e-3, 1e-3);
    }
    return (pass ? 0 : 1);
 }
--- a/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp
+++ b/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp
@@ -16,6 +16,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/numeric.hpp"

 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -297,33 +298,19 @@ int main(int argc, char* argv[])
        const auto e_ms_ns_lengths = contraction_descs[i].e_ms_ns_lengths;
        const auto e_ms_ns_strides = contraction_descs[i].e_ms_ns_strides;

-        Tensor<ADataType> a_ms_ks(
-            std::vector<std::size_t>(a_ms_ks_lengths.begin(), a_ms_ks_lengths.end()),
-            std::vector<std::size_t>(a_ms_ks_strides.begin(), a_ms_ks_strides.end()));
-        Tensor<BDataType> b_ns_ks(
-            std::vector<std::size_t>(b_ns_ks_lengths.begin(), b_ns_ks_lengths.end()),
-            std::vector<std::size_t>(b_ns_ks_strides.begin(), b_ns_ks_strides.end()));
-        Tensor<DDataType> d_ms_ns(
-            std::vector<std::size_t>(d_ms_ns_lengths.begin(), d_ms_ns_lengths.end()),
-            std::vector<std::size_t>(d_ms_ns_strides.begin(), d_ms_ns_strides.end()));
-        Tensor<EDataType> e_ms_ns_device_result(
-            std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-            std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
-
-        ck::index_t M_ = std::accumulate(e_ms_ns_lengths.begin(),
-                                         e_ms_ns_lengths.begin() + NumDimM,
-                                         ck::index_t{1},
-                                         std::multiplies<ck::index_t>{});
-
-        ck::index_t N_ = std::accumulate(e_ms_ns_lengths.begin() + NumDimM,
-                                         e_ms_ns_lengths.begin() + NumDimM + NumDimN,
-                                         ck::index_t{1},
-                                         std::multiplies<ck::index_t>{});
-
-        ck::index_t K_ = std::accumulate(a_ms_ks_lengths.begin() + NumDimM,
-                                         a_ms_ks_lengths.begin() + NumDimM + NumDimK,
-                                         ck::index_t{1},
-                                         std::multiplies<ck::index_t>{});
+        Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides);
+        Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides);
+        Tensor<DDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides);
+        Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
+
+        ck::index_t M_ =
+            ck::accumulate_n<ck::index_t>(e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
+
+        ck::index_t N_ = ck::accumulate_n<ck::index_t>(
+            e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
+
+        ck::index_t K_ = ck::accumulate_n<ck::index_t>(
+            a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});

        a_tensors.push_back(a_ms_ks);
        b_tensors.push_back(b_ns_ks);
@@ -423,13 +410,9 @@ int main(int argc, char* argv[])
            const auto e_ms_ns_lengths = contraction_descs[i].e_ms_ns_lengths;
            const auto e_ms_ns_strides = contraction_descs[i].e_ms_ns_strides;

-            Tensor<EDataType> c_ms_ns_host_result(
-                std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-                std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
+            Tensor<EDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);

-            Tensor<EDataType> e_ms_ns_host_result(
-                std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-                std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
+            Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);

            e_tensors_device[i]->FromDevice(e_device_tensors[i].mData.data());

@@ -475,7 +458,7 @@ int main(int argc, char* argv[])
                }
            }

-            pass &= ck::utils::check_err(e_device_tensors[i].mData, e_ms_ns_host_result.mData);
+            pass &= ck::utils::check_err(e_device_tensors[i], e_ms_ns_host_result);
        }
    }


--- a/example/29_batched_gemm_bias_e_permute/CMakeLists.txt
+++ b/example/29_batched_gemm_bias_e_permute/CMakeLists.txt
 add_example_executable(example_batched_gemm_bias_e_permute_xdl_fp16 batched_gemm_bias_e_permute_xdl_fp16.cpp)
+
+if(GPU_TARGETS MATCHES "gfx1100")
+    add_example_executable(example_batched_gemm_bias_e_permute_wmma_fp16 batched_gemm_bias_e_permute_wmma_fp16.cpp)
+endif()
--- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp
+++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/numeric.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Add         = ck::tensor_operation::element_wise::Add;
+
+using ADataType        = F16;
+using BDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F16;
+using DDataType        = F16;
+using DsDataType       = ck::Tuple<DDataType>;
+using EDataType        = F16;
+
+static constexpr ck::index_t NumDimG = 2;
+static constexpr ck::index_t NumDimM = 2;
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 1;
+
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using CDEElementOp = ck::tensor_operation::element_wise::Add;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
+
+static constexpr auto ABSpec = ck::tensor_operation::device::TensorSpecialization::Packed;
+static constexpr auto DESpec = ck::tensor_operation::device::TensorSpecialization::Default;
+
+using DeviceOpInstanceKKNN =
+    ck::tensor_operation::device::DeviceBatchedContractionMultipleD_Wmma_CShuffle<NumDimG,
+                                                                                  NumDimM,
+                                                                                  NumDimN,
+                                                                                  NumDimK,
+                                                                                  ADataType,
+                                                                                  BDataType,
+                                                                                  DsDataType,
+                                                                                  EDataType,
+                                                                                  AccDataType,
+                                                                                  CShuffleDataType,
+                                                                                  AElementOp,
+                                                                                  BElementOp,
+                                                                                  CDEElementOp,
+                                                                                  GemmSpec,
+                                                                                  ABSpec,
+                                                                                  ABSpec,
+                                                                                  DESpec,
+                                                                                  256,
+                                                                                  128,
+                                                                                  256,
+                                                                                  8,
+                                                                                  8,
+                                                                                  16,
+                                                                                  16,
+                                                                                  4,
+                                                                                  4,
+                                                                                  S<4, 64, 1>,
+                                                                                  S<1, 0, 2>,
+                                                                                  S<1, 0, 2>,
+                                                                                  2,
+                                                                                  8,
+                                                                                  8,
+                                                                                  true,
+                                                                                  S<4, 64, 1>,
+                                                                                  S<1, 0, 2>,
+                                                                                  S<1, 0, 2>,
+                                                                                  2,
+                                                                                  8,
+                                                                                  8,
+                                                                                  true,
+                                                                                  1,
+                                                                                  1,
+                                                                                  S<1, 32, 1, 8>,
+                                                                                  8>;
+
+using DeviceOpInstance = DeviceOpInstanceKKNN;
+
+// hardcoded for NumDimM == NumDimN == NumDimK == 2
+template <ck::index_t NumDimG,
+          ck::index_t NumDimM,
+          ck::index_t NumDimN,
+          ck::index_t NumDimK,
+          typename ADataType,
+          typename BDataType,
+          typename EDataType,
+          typename AccDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation,
+          ck::enable_if_t<NumDimG == 2 && NumDimM == 2 && NumDimN == 2 && NumDimK == 1, bool> =
+              false>
+struct ReferenceContraction_G2_M2_N2_K1 : public ck::tensor_operation::device::BaseOperator
+{
+    // Argument
+    struct Argument : public ck::tensor_operation::device::BaseArgument
+    {
+        Argument(const Tensor<ADataType>& a_gs_ms_ks,
+                 const Tensor<BDataType>& b_gs_ns_ks,
+                 Tensor<EDataType>& e_gs_ms_ns,
+                 AElementwiseOperation a_element_op,
+                 BElementwiseOperation b_element_op,
+                 CDEElementwiseOperation cde_element_op)
+            : a_gs_ms_ks_{a_gs_ms_ks},
+              b_gs_ns_ks_{b_gs_ns_ks},
+              e_gs_ms_ns_{e_gs_ms_ns},
+              a_element_op_{a_element_op},
+              b_element_op_{b_element_op},
+              cde_element_op_{cde_element_op}
+        {
+        }
+
+        const Tensor<ADataType>& a_gs_ms_ks_;
+        const Tensor<BDataType>& b_gs_ns_ks_;
+        Tensor<EDataType>& e_gs_ms_ns_;
+
+        AElementwiseOperation a_element_op_;
+        BElementwiseOperation b_element_op_;
+        CDEElementwiseOperation cde_element_op_;
+    };
+
+    // Invoker
+    struct Invoker : public ck::tensor_operation::device::BaseInvoker
+    {
+        using Argument = ReferenceContraction_G2_M2_N2_K1::Argument;
+
+        float Run(const Argument& arg)
+        {
+            auto f_ms_ns = [&](auto g0, auto g1, auto m0, auto m1, auto n0, auto n1) {
+                const int K0 = arg.a_gs_ms_ks_.mDesc.GetLengths()[4];
+
+                AccDataType v_acc = 0;
+
+                for(int k0 = 0; k0 < K0; ++k0)
+                {
+                    AccDataType v_a;
+                    AccDataType v_b;
+
+                    arg.a_element_op_(
+                        v_a,
+                        ck::type_convert<const AccDataType>(arg.a_gs_ms_ks_(g0, g1, m0, m1, k0)));
+                    arg.b_element_op_(
+                        v_b,
+                        ck::type_convert<const AccDataType>(arg.b_gs_ns_ks_(g0, g1, n0, n1, k0)));
+
+                    v_acc += v_a * v_b;
+                }
+
+                AccDataType v_c;
+
+                arg.cde_element_op_(v_c, v_acc);
+
+                arg.e_gs_ms_ns_(g0, g1, m0, m1, n0, n1) = v_c;
+            };
+
+            make_ParallelTensorFunctor(f_ms_ns,
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[0],
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[1],
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[2],
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[3],
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[4],
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[5])(
+                std::thread::hardware_concurrency());
+
+            return 0;
+        }
+
+        float Run(const ck::tensor_operation::device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg));
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override
+    {
+        return true;
+    }
+
+    static auto MakeArgument(const Tensor<ADataType>& a_gs_ms_ks,
+                             const Tensor<BDataType>& b_gs_ns_ks,
+                             Tensor<EDataType>& e_gs_ms_ns,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CDEElementwiseOperation cde_element_op)
+    {
+        return Argument{
+            a_gs_ms_ks, b_gs_ns_ks, e_gs_ms_ns, a_element_op, b_element_op, cde_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    virtual std::unique_ptr<ck::tensor_operation::device::BaseInvoker> MakeInvokerPointer()
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "ReferenceContraction_G2_M2_N2_K1"
+            << std::endl;
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = true;
+
+    ck::index_t G0 = 1;
+    ck::index_t G1 = 2;
+
+    ck::index_t M0 = 4;
+    ck::index_t M1 = 128;
+
+    ck::index_t N0 = 16;
+    ck::index_t N1 = 256;
+
+    ck::index_t K0 = 2048;
+
+    // A[G0, G1, M0, M1, K0]
+    std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, G1, M0, M1, K0};
+    std::vector<ck::index_t> a_gs_ms_ks_strides{G1 * M0 * M1 * K0, M0 * M1 * K0, M1 * K0, K0, 1};
+    // B[G0, G1, N0, N1, K0]
+    std::vector<ck::index_t> b_gs_ns_ks_lengths{G0, G1, N0, N1, K0};
+    std::vector<ck::index_t> b_gs_ns_ks_strides{G1 * N0 * N1 * K0, N0 * N1 * K0, N1 * K0, K0, 1};
+
+    // D[G0, G1, M0, N0, M1, N1]
+    std::vector<ck::index_t> d_gs_ms_ns_lengths{G0, G1, M0, M1, N0, N1};
+    std::vector<ck::index_t> d_gs_ms_ns_strides{G1 * N0 * N1, N0 * N1, 0, 0, N1, 1};
+    // E[G0, G1, M0, N0, M1, N1]
+    std::vector<ck::index_t> e_gs_ms_ns_lengths{G0, G1, M0, M1, N0, N1};
+    std::vector<ck::index_t> e_gs_ms_ns_strides{
+        G1 * M0 * N0 * M1 * N1, M0 * N0 * M1 * N1, N0 * M1 * N1, N1, M1 * N1, 1};
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
+    Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides);
+    Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides);
+    Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
+    Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
+    std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
+    std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
+    std::cout << "d_gs_ms_ns: " << d_gs_ms_ns.mDesc << std::endl;
+    std::cout << "e_gs_ms_ns: " << e_gs_ms_ns_host_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_2<DDataType>{-5, 5});
+        break;
+    default:
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_3<DDataType>{-0.5, 0.5});
+        break;
+    }
+    DeviceMem a_device_buf(sizeof(ADataType) * a_gs_ms_ks.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_gs_ns_ks.mDesc.GetElementSpaceSize());
+    DeviceMem d_device_buf(sizeof(DDataType) * d_gs_ms_ns.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) *
+                           e_gs_ms_ns_device_result.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a_gs_ms_ks.mData.data());
+    b_device_buf.ToDevice(b_gs_ns_ks.mData.data());
+    d_device_buf.ToDevice(d_gs_ms_ns.mData.data());
+
+    // set zero
+    e_device_buf.SetZero();
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    // device operation
+    auto op       = DeviceOpInstance{};
+    auto invoker  = op.MakeInvoker();
+    auto argument = op.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                                    b_device_buf.GetDeviceBuffer(),
+                                    std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
+                                    e_device_buf.GetDeviceBuffer(),
+                                    a_gs_ms_ks_lengths,
+                                    a_gs_ms_ks_strides,
+                                    b_gs_ns_ks_lengths,
+                                    b_gs_ns_ks_strides,
+                                    std::array<std::vector<ck::index_t>, 1>{d_gs_ms_ns_lengths},
+                                    std::array<std::vector<ck::index_t>, 1>{d_gs_ms_ns_strides},
+                                    e_gs_ms_ns_lengths,
+                                    e_gs_ms_ns_strides,
+                                    a_element_op,
+                                    b_element_op,
+                                    cde_element_op);
+
+    if(!op.IsSupportedArgument(argument))
+    {
+        std::cout << op.GetTypeString() << " does not support this problem" << std::endl;
+
+        return 0;
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    ck::index_t G =
+        ck::accumulate_n<ck::index_t>(e_gs_ms_ns_lengths.begin(), NumDimG, 1, std::multiplies<>{});
+
+    ck::index_t M = ck::accumulate_n<ck::index_t>(
+        e_gs_ms_ns_lengths.begin() + NumDimG, NumDimM, 1, std::multiplies<>{});
+
+    ck::index_t N = ck::accumulate_n<ck::index_t>(
+        e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM, NumDimN, 1, std::multiplies<>{});
+
+    ck::index_t K = ck::accumulate_n<ck::index_t>(
+        a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM, NumDimK, 1, std::multiplies<>{});
+    std::cout << "GMNK=" << G << ", " << M << ", " << N << ", " << K << std::endl;
+    std::size_t flop      = std::size_t(2) * G * M * N * K;
+    std::size_t num_btype = sizeof(ADataType) * G * M * K + sizeof(BDataType) * G * K * N +
+                            sizeof(DDataType) * G * M * N + sizeof(EDataType) * G * M * N;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << op.GetTypeString() << std::endl;
+
+    e_device_buf.FromDevice(e_gs_ms_ns_device_result.mData.data());
+
+    if(do_verification)
+    {
+        Tensor<CShuffleDataType> c_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
+
+        using ReferenceOpInstance = ReferenceContraction_G2_M2_N2_K1<NumDimG,
+                                                                     NumDimM,
+                                                                     NumDimN,
+                                                                     NumDimK,
+                                                                     ADataType,
+                                                                     BDataType,
+                                                                     CShuffleDataType,
+                                                                     AccDataType,
+                                                                     AElementOp,
+                                                                     BElementOp,
+                                                                     PassThrough>;
+
+        auto ref_gemm    = ReferenceOpInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_gs_ms_ks, b_gs_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        for(size_t g0 = 0; g0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[0]; ++g0)
+        {
+            for(size_t g1 = 0; g1 < e_gs_ms_ns_host_result.mDesc.GetLengths()[1]; ++g1)
+            {
+                for(size_t m0 = 0; m0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[2]; ++m0)
+                {
+                    for(size_t m1 = 0; m1 < e_gs_ms_ns_host_result.mDesc.GetLengths()[3]; ++m1)
+                    {
+                        for(size_t n0 = 0; n0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[4]; ++n0)
+                        {
+                            for(size_t n1 = 0; n1 < e_gs_ms_ns_host_result.mDesc.GetLengths()[5];
+                                ++n1)
+                            {
+                                cde_element_op(e_gs_ms_ns_host_result(g0, g1, m0, m1, n0, n1),
+                                               c_ms_ns_host_result(g0, g1, m0, m1, n0, n1),
+                                               d_gs_ms_ns(g0, g1, m0, m1, n0, n1));
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        return ck::utils::check_err(e_gs_ms_ns_device_result, e_gs_ms_ns_host_result) ? 0 : 1;
+    }
+
+    return 0;
+}
--- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp
+++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp
@@ -15,6 +15,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/numeric.hpp"

 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -246,21 +247,11 @@ int main(int argc, char* argv[])
        exit(0);
    }

-    Tensor<ADataType> a_gs_ms_ks(
-        std::vector<std::size_t>(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.end()),
-        std::vector<std::size_t>(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end()));
-    Tensor<BDataType> b_gs_ns_ks(
-        std::vector<std::size_t>(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.end()),
-        std::vector<std::size_t>(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end()));
-    Tensor<DDataType> d_gs_ms_ns(
-        std::vector<std::size_t>(d_gs_ms_ns_lengths.begin(), d_gs_ms_ns_lengths.end()),
-        std::vector<std::size_t>(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end()));
-    Tensor<EDataType> e_gs_ms_ns_host_result(
-        std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
-    Tensor<EDataType> e_gs_ms_ns_device_result(
-        std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
+    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
+    Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides);
+    Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides);
+    Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
+    Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);

    std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
    std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
@@ -327,25 +318,17 @@ int main(int argc, char* argv[])

    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

-    ck::index_t G = std::accumulate(e_gs_ms_ns_lengths.begin(),
-                                    e_gs_ms_ns_lengths.begin() + NumDimG,
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
+    ck::index_t G =
+        ck::accumulate_n<ck::index_t>(e_gs_ms_ns_lengths.begin(), NumDimG, 1, std::multiplies<>{});

-    ck::index_t M = std::accumulate(e_gs_ms_ns_lengths.begin() + NumDimG,
-                                    e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM,
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
+    ck::index_t M = ck::accumulate_n<ck::index_t>(
+        e_gs_ms_ns_lengths.begin() + NumDimG, NumDimM, 1, std::multiplies<>{});

-    ck::index_t N = std::accumulate(e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM,
-                                    e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM + NumDimN,
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
+    ck::index_t N = ck::accumulate_n<ck::index_t>(
+        e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM, NumDimN, 1, std::multiplies<>{});

-    ck::index_t K = std::accumulate(a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM,
-                                    a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM + NumDimK,
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
+    ck::index_t K = ck::accumulate_n<ck::index_t>(
+        a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM, NumDimK, 1, std::multiplies<>{});

    std::size_t flop      = std::size_t(2) * G * M * N * K;
    std::size_t num_btype = sizeof(ADataType) * G * M * K + sizeof(BDataType) * G * K * N +
@@ -362,9 +345,7 @@ int main(int argc, char* argv[])

    if(do_verification)
    {
-        Tensor<CShuffleDataType> c_ms_ns_host_result(
-            std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
-            std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
+        Tensor<CShuffleDataType> c_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);

        using ReferenceOpInstance = ReferenceContraction_G2_M2_N2_K1<NumDimG,
                                                                     NumDimM,
@@ -409,9 +390,7 @@ int main(int argc, char* argv[])
            }
        }

-        return ck::utils::check_err(e_gs_ms_ns_device_result.mData, e_gs_ms_ns_host_result.mData)
-                   ? 0
-                   : 1;
+        return ck::utils::check_err(e_gs_ms_ns_device_result, e_gs_ms_ns_host_result) ? 0 : 1;
    }

    return 0;

--- a/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
+++ b/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
+add_custom_target(example_grouped_conv_fwd_multiple_d)
+
+add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp16 grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp)
+add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp32 grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp)
+add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_bf16 grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp)
+add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int8 grouped_conv_fwd_bias_relu_add_xdl_int8.cpp)
+
+add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp16)
+add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp32)
+add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_bf16)
+add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int8)
+
+if(USE_BITINT_EXTENSION_INT4)
+  add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int4 grouped_conv_fwd_bias_relu_add_xdl_int4.cpp)
+
+  add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4)
+endif() # USE_BITINT_EXTENSION_INT4
+
+if(GPU_TARGETS MATCHES "gfx1100")
+  add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_fp16 grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp)
+endif()
+
+add_example_executable(example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp)
+
+add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_xdl_fp16)
--- a/example/30_grouped_conv_fwd_multiple_d/README.md
+++ b/example/30_grouped_conv_fwd_multiple_d/README.md
+Command
+```bash
+arg1: verification (0=no, 1=yes)
+arg2: initialization (0=no init, 1=integer value, 2=decimal value)
+arg3: time kernel (0=no, 1=yes)
+Following arguments (depending on number of spatial dims):
+ Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)
+ G, N, K, C,
+ <filter spatial dimensions>, (ie Y, X for 2D)
+ <input image spatial dimensions>, (ie Hi, Wi for 2D)
+ <strides>, (ie Sy, Sx for 2D)
+ <dilations>, (ie Dy, Dx for 2D)
+ <left padding>, (ie LeftPy, LeftPx for 2D)
+ <right padding>, (ie RightPy, RightPx for 2D)
+
+./bin/example_grouped_conv_fwd_bias_relu_add_xdl_fp16 1 1 1
+```
+
+Result (MI100)
+```
+in: dim 5, lengths {1, 128, 192, 71, 71}, strides {192, 967872, 1, 13632, 192}
+wei: dim 5, lengths {1, 256, 192, 3, 3}, strides {442368, 1728, 1, 576, 192}
+bias: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
+residual: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
+out: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 331776, 1, 9216, 256}
+launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
+Warm up 1 time
+Start running 10 times...
+Perf: 1.55981 ms, 94.0927 TFlops, 213.868 GB/s, DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<256, 128, 256, 16, Default>
+```
--- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include "grouped_convnd_fwd_bias_relu_add_common.hpp"
+#pragma once

-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
-
-#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
-
-// kernel data types
-using InKernelDataType       = int8_t;
-using WeiKernelDataType      = int8_t;
-using AccDataType            = int32_t;
-using CShuffleDataType       = int8_t;
-using BiasKernelDataType     = int8_t;
-using ResidualKernelDataType = int8_t;
-using OutKernelDataType      = int8_t;
+#include <algorithm>
+#include <array>
+#include <iostream>
+#include <string>
+#include <type_traits>

-// tensor data types
-using InUserDataType  = ck::int4_t;
-using WeiUserDataType = ck::int4_t;
-using OutUserDataType = ck::int4_t;
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+
+using BF16 = ck::bhalf_t;
+using FP16 = ck::half_t;
+using FP32 = float;
+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+using I4 = ck::int4_t;
+#endif
+using I8  = std::int8_t;
+using I32 = std::int32_t;

 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

-using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
-using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;

 static constexpr auto ConvSpec =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

 static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

-template <ck::index_t NDimSpatial,
-          typename InLayout,
-          typename WeiLayout,
-          typename BiasLayout,
-          typename ResidualLayout,
-          typename OutLayout>
-using DeviceGroupedConvNDFwdInstance =
-    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
-        NDimSpatial,
-        InLayout,
-        WeiLayout,
-        ck::Tuple<BiasLayout, ResidualLayout>,
-        OutLayout,
-        InKernelDataType,
-        WeiKernelDataType,
-        AccDataType,
-        CShuffleDataType,
-        ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
-        OutKernelDataType,
-        InElementOp,
-        WeiElementOp,
-        OutElementOp,
-        ConvSpec,    // ConvForwardSpecialization
-        GemmSpec,    // GemmSpecialization
-        1,           //
-        256,         // BlockSize
-        128,         // MPerBlock
-        256,         // NPerBlock
-        64,          // KPerBlock
-        16,          // AK1
-        16,          // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
-        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
-        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
-        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
-        2,           // ABlockTransferSrcVectorDim
-        16,          // ABlockTransferSrcScalarPerVector
-        16,          // ABlockTransferDstScalarPerVector_AK1
-        1,           // ABlockLdsExtraM
-        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
-        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
-        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
-        2,           // BBlockTransferSrcVectorDim
-        16,          // BBlockTransferSrcScalarPerVector
-        16,          // BBlockTransferDstScalarPerVector_BK1
-        1,           // BBlockLdsExtraN
-        1,
-        1,
-        S<1, 64, 1, 4>,
-        16>;
-
-int main(int argc, char* argv[])
+template <typename InputLay, typename WeightLay, typename OutputLay>
+struct CommonLayoutSetting
+{
+    using InputLayout  = InputLay;
+    using WeightLayout = WeightLay;
+    using OutputLayout = OutputLay;
+};
+
+template <ck::index_t NDimSpatial>
+struct CommonLayoutSettingSelector;
+
+namespace ctl = ck::tensor_layout::convolution;
+
+template <>
+struct CommonLayoutSettingSelector<1> final
+    : CommonLayoutSetting<ctl::G_NW_C, ctl::G_K_X_C, ctl::G_NW_K>
 {
-    namespace ctc = ck::tensor_layout::convolution;
+};

-    print_helper_msg();
+template <>
+struct CommonLayoutSettingSelector<2> final
+    : CommonLayoutSetting<ctl::G_NHW_C, ctl::G_K_YX_C, ctl::G_NHW_K>
+{
+};

+template <>
+struct CommonLayoutSettingSelector<3> final
+    : CommonLayoutSetting<ctl::G_NDHW_C, ctl::G_K_ZYX_C, ctl::G_NDHW_K>
+{
+};
+
+template <ck::index_t NDimSpatial>
+using InputLayout = typename CommonLayoutSettingSelector<NDimSpatial>::InputLayout;
+
+template <ck::index_t NDimSpatial>
+using WeightLayout = typename CommonLayoutSettingSelector<NDimSpatial>::WeightLayout;
+
+template <ck::index_t NDimSpatial>
+using OutputLayout = typename CommonLayoutSettingSelector<NDimSpatial>::OutputLayout;
+
+struct ExecutionConfig final
+{
    bool do_verification = true;
    int init_method      = 1;
-    bool time_kernel     = false;
-
-    // conventional group conv definition
-    // G = 2
-    // [N, C, Hi, Wi] =  [128, 384, 71, 71]
-    // [K, C,  Y,  X] =  [512, 192,  3,  3]
-    // [N, K, Ho, Wo] =  [128, 512, 36, 36]
-    // CK group conv definition
-    // [G, N, C, Hi, Wi] =  [2, 128, 192, 71, 71]
-    // [G, K, C,  Y,  X] =  [2, 256, 192,  3,  3]
-    // [G, N, K, Ho, Wo] =  [2, 128, 256, 36, 36]
-    ck::utils::conv::ConvParam conv_param{
-        2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
+    bool time_kernel     = true;
+};
+
+#define DefaultConvParam                                                       \
+    ck::utils::conv::ConvParam                                                 \
+    {                                                                          \
+        2, 32, 2, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, { 1, 1 } \
+    }
+
+inline void print_help_msg()
+{
+    std::cerr << "arg1: verification (0=no, 1=yes)\n"
+              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
+              << "arg3: time kernel (0=no, 1=yes)\n"
+              << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
+}
+
+inline bool parse_cmd_args(int argc,
+                           char* argv[],
+                           ExecutionConfig& config,
+                           ck::utils::conv::ConvParam& conv_param)
+{
+    constexpr int num_execution_config_args =
+        3; // arguments for do_verification, init_method, time_kernel
+    constexpr int num_conv_param_leading_args = 5; // arguments for num_dim_spatial_, G_, N_, K_, C_
+
+    constexpr int threshold_to_catch_partial_args = 1 + num_execution_config_args;
+    constexpr int threshold_to_catch_all_args =
+        threshold_to_catch_partial_args + num_conv_param_leading_args;

    if(argc == 1)
    {
        // use default
    }
-    else if(argc == 4)
+    // catch only ExecutionConfig arguments
+    else if(argc == threshold_to_catch_partial_args)
    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        time_kernel     = std::stoi(argv[3]);
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
    }
-    else
+    // catch both ExecutionConfig & ConvParam arguments
+    else if(threshold_to_catch_all_args < argc && ((argc - threshold_to_catch_all_args) % 3 == 0))
    {
-        do_verification                   = std::stoi(argv[1]);
-        init_method                       = std::stoi(argv[2]);
-        time_kernel                       = std::stoi(argv[3]);
-        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);

-        conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
+        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
+        conv_param                        = ck::utils::conv::parse_conv_param(
+            num_dim_spatial, threshold_to_catch_partial_args + 1, argv);
+    }
+    else
+    {
+        print_help_msg();
+        return false;
    }

-    const auto in_element_op  = InElementOp{};
-    const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{};
+    return true;
+}

-    if(conv_param.num_dim_spatial_ == 1)
+inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvParam& conv_param)
+{
+    switch(conv_param.num_dim_spatial_)
    {
-        using InLayout       = ctc::G_NW_C;
-        using WeiLayout      = ctc::G_K_X_C;
-        using BiasLayout     = ctc::G_K;
-        using ResidualLayout = ctc::G_NW_K;
-        using OutLayout      = ctc::G_NW_K;
-
-        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
+    case 1:
+        return HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
            {
                conv_param.C_,                                                        // g
@@ -150,81 +162,8 @@ int main(int argc, char* argv[])
                conv_param.G_ * conv_param.C_                                         // wi
            });

-        const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
-            {conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
-            {
-                conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
-                conv_param.filter_spatial_lengths_[0] * conv_param.C_,                 // k
-                1,                                                                     // c
-                conv_param.C_                                                          // x
-            });
-
-        const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
-            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
-            {
-                conv_param.K_, // g
-                0,             // k
-                1,             // c
-                0              // x
-            });
-
-        const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
-            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
-            {
-                conv_param.K_, // g
-                0,             // k
-                1,             // c
-                0              // x
-            });
-
-        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
-            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
-            {
-                conv_param.K_,                                                         // g
-                conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
-                1,                                                                     // k
-                conv_param.G_ * conv_param.K_                                          // wo
-            });
-
-        return run_grouped_conv_fwd_bias_relu_add<1,
-                                                  InKernelDataType,
-                                                  WeiKernelDataType,
-                                                  CShuffleDataType,
-                                                  OutKernelDataType,
-                                                  InElementOp,
-                                                  WeiElementOp,
-                                                  OutElementOp,
-                                                  InUserDataType,
-                                                  WeiUserDataType,
-                                                  OutUserDataType,
-                                                  DeviceGroupedConvNDFwdInstance<1,
-                                                                                 InLayout,
-                                                                                 WeiLayout,
-                                                                                 BiasLayout,
-                                                                                 ResidualLayout,
-                                                                                 OutLayout>>(
-            do_verification,
-            init_method,
-            time_kernel,
-            conv_param,
-            in_g_n_c_wis_desc,
-            wei_g_k_c_xs_desc,
-            bias_g_n_k_wos_desc,
-            residual_g_n_k_wos_desc,
-            out_g_n_k_wos_desc,
-            in_element_op,
-            wei_element_op,
-            out_element_op);
-    }
-    else if(conv_param.num_dim_spatial_ == 2)
-    {
-        using InLayout       = ctc::G_NHW_C;
-        using WeiLayout      = ctc::G_K_YX_C;
-        using BiasLayout     = ctc::G_K;
-        using ResidualLayout = ctc::G_NHW_K;
-        using OutLayout      = ctc::G_NHW_K;
-
-        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
+    case 2:
+        return HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.C_,
@@ -239,104 +178,8 @@ int main(int argc, char* argv[])
                conv_param.G_ * conv_param.C_                                         // wi
            });

-        const auto wei_g_k_c_xs_desc =
-            HostTensorDescriptor({conv_param.G_,
-                                  conv_param.K_,
-                                  conv_param.C_,
-                                  conv_param.filter_spatial_lengths_[0],
-                                  conv_param.filter_spatial_lengths_[1]},
-                                 {
-                                     conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
-                                         conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
-                                     conv_param.filter_spatial_lengths_[0] *
-                                         conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
-                                     1,                                                         // c
-                                     conv_param.filter_spatial_lengths_[1] * conv_param.C_,     // y
-                                     conv_param.C_                                              // x
-                                 });
-
-        const auto bias_g_n_k_wos_desc =
-            HostTensorDescriptor({conv_param.G_,
-                                  conv_param.N_,
-                                  conv_param.K_,
-                                  conv_param.output_spatial_lengths_[0],
-                                  conv_param.output_spatial_lengths_[1]},
-                                 {
-                                     conv_param.K_, // g
-                                     0,             // n
-                                     1,             // k
-                                     0,             // ho
-                                     0              // wo
-                                 });
-
-        const auto residual_g_n_k_wos_desc =
-            HostTensorDescriptor({conv_param.G_,
-                                  conv_param.N_,
-                                  conv_param.K_,
-                                  conv_param.output_spatial_lengths_[0],
-                                  conv_param.output_spatial_lengths_[1]},
-                                 {
-                                     conv_param.K_, // g
-                                     0,             // n
-                                     1,             // k
-                                     0,             // ho
-                                     0              // wo
-                                 });
-
-        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
-            {conv_param.G_,
-             conv_param.N_,
-             conv_param.K_,
-             conv_param.output_spatial_lengths_[0],
-             conv_param.output_spatial_lengths_[1]},
-            {
-                conv_param.K_, // g
-                conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
-                    conv_param.G_ * conv_param.K_,                                     // n
-                1,                                                                     // k
-                conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
-                conv_param.G_ * conv_param.K_                                          // wo
-            });
-
-        return run_grouped_conv_fwd_bias_relu_add<2,
-                                                  InKernelDataType,
-                                                  WeiKernelDataType,
-                                                  CShuffleDataType,
-                                                  OutKernelDataType,
-                                                  InElementOp,
-                                                  WeiElementOp,
-                                                  OutElementOp,
-                                                  InUserDataType,
-                                                  WeiUserDataType,
-                                                  OutUserDataType,
-                                                  DeviceGroupedConvNDFwdInstance<2,
-                                                                                 InLayout,
-                                                                                 WeiLayout,
-                                                                                 BiasLayout,
-                                                                                 ResidualLayout,
-                                                                                 OutLayout>>(
-            do_verification,
-            init_method,
-            time_kernel,
-            conv_param,
-            in_g_n_c_wis_desc,
-            wei_g_k_c_xs_desc,
-            bias_g_n_k_wos_desc,
-            residual_g_n_k_wos_desc,
-            out_g_n_k_wos_desc,
-            in_element_op,
-            wei_element_op,
-            out_element_op);
-    }
-    else if(conv_param.num_dim_spatial_ == 3)
-    {
-        using InLayout       = ctc::G_NDHW_C;
-        using WeiLayout      = ctc::G_K_ZYX_C;
-        using BiasLayout     = ctc::G_K;
-        using ResidualLayout = ctc::G_NDHW_K;
-        using OutLayout      = ctc::G_NDHW_K;
-
-        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
+    case 3:
+        return HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.C_,
@@ -353,8 +196,42 @@ int main(int argc, char* argv[])
                conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
                conv_param.G_ * conv_param.C_                                         // wi
            });
+    }

-        const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
+    throw std::runtime_error("unsuppored # dim spatial");
+}
+
+inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvParam& conv_param)
+{
+    switch(conv_param.num_dim_spatial_)
+    {
+    case 1:
+        return HostTensorDescriptor(
+            {conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
+            {
+                conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
+                conv_param.filter_spatial_lengths_[0] * conv_param.C_,                 // k
+                1,                                                                     // c
+                conv_param.C_                                                          // x
+            });
+    case 2:
+        return HostTensorDescriptor(
+            {conv_param.G_,
+             conv_param.K_,
+             conv_param.C_,
+             conv_param.filter_spatial_lengths_[0],
+             conv_param.filter_spatial_lengths_[1]},
+            {
+                conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
+                    conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
+                conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
+                    conv_param.C_,                                     // k
+                1,                                                     // c
+                conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
+                conv_param.C_                                          // x
+            });
+    case 3:
+        return HostTensorDescriptor(
            {conv_param.G_,
             conv_param.K_,
             conv_param.C_,
@@ -373,40 +250,89 @@ int main(int argc, char* argv[])
                conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
                conv_param.C_                                          // x
            });
+    }

-        const auto bias_g_n_k_wos_desc =
-            HostTensorDescriptor({conv_param.G_,
-                                  conv_param.N_,
-                                  conv_param.K_,
-                                  conv_param.output_spatial_lengths_[0],
-                                  conv_param.output_spatial_lengths_[1],
-                                  conv_param.output_spatial_lengths_[2]},
-                                 {
-                                     conv_param.K_, // g
-                                     0,             // n
-                                     1,             // k
-                                     0,             // z
-                                     0,             // y
-                                     0              // x
-                                 });
-
-        const auto residual_g_n_k_wos_desc =
-            HostTensorDescriptor({conv_param.G_,
-                                  conv_param.N_,
-                                  conv_param.K_,
-                                  conv_param.output_spatial_lengths_[0],
-                                  conv_param.output_spatial_lengths_[1],
-                                  conv_param.output_spatial_lengths_[2]},
-                                 {
-                                     conv_param.K_, // g
-                                     0,             // n
-                                     1,             // k
-                                     0,             // z
-                                     0,             // y
-                                     0              // x
-                                 });
-
-        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
+    throw std::runtime_error("unsuppored # dim spatial");
+}
+
+inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvParam& conv_param)
+{
+    switch(conv_param.num_dim_spatial_)
+    {
+    case 1:
+        return HostTensorDescriptor(
+            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
+            {
+                conv_param.K_, // g
+                0,             // k
+                1,             // c
+                0              // x
+            });
+    case 2:
+        return HostTensorDescriptor({conv_param.G_,
+                                     conv_param.N_,
+                                     conv_param.K_,
+                                     conv_param.output_spatial_lengths_[0],
+                                     conv_param.output_spatial_lengths_[1]},
+                                    {
+                                        conv_param.K_, // g
+                                        0,             // n
+                                        1,             // k
+                                        0,             // ho
+                                        0              // wo
+                                    });
+    case 3:
+        return HostTensorDescriptor({conv_param.G_,
+                                     conv_param.N_,
+                                     conv_param.K_,
+                                     conv_param.output_spatial_lengths_[0],
+                                     conv_param.output_spatial_lengths_[1],
+                                     conv_param.output_spatial_lengths_[2]},
+                                    {
+                                        conv_param.K_, // g
+                                        0,             // n
+                                        1,             // k
+                                        0,             // z
+                                        0,             // y
+                                        0              // x
+                                    });
+    }
+
+    throw std::runtime_error("unsuppored # dim spatial");
+}
+
+inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvParam& conv_param)
+{
+
+    switch(conv_param.num_dim_spatial_)
+    {
+    case 1:
+        return HostTensorDescriptor(
+            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
+            {
+                conv_param.K_,                                                         // g
+                conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
+                1,                                                                     // k
+                conv_param.G_ * conv_param.K_                                          // wo
+            });
+    case 2:
+        return HostTensorDescriptor(
+            {conv_param.G_,
+             conv_param.N_,
+             conv_param.K_,
+             conv_param.output_spatial_lengths_[0],
+             conv_param.output_spatial_lengths_[1]},
+            {
+                conv_param.K_, // g
+                conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
+                    conv_param.G_ * conv_param.K_,                                     // n
+                1,                                                                     // k
+                conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
+                conv_param.G_ * conv_param.K_                                          // wo
+            });
+
+    case 3:
+        return HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.K_,
@@ -423,37 +349,7 @@ int main(int argc, char* argv[])
                conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
                conv_param.G_ * conv_param.K_                                          // wo
            });
-
-        return run_grouped_conv_fwd_bias_relu_add<3,
-                                                  InKernelDataType,
-                                                  WeiKernelDataType,
-                                                  CShuffleDataType,
-                                                  OutKernelDataType,
-                                                  InElementOp,
-                                                  WeiElementOp,
-                                                  OutElementOp,
-                                                  InUserDataType,
-                                                  WeiUserDataType,
-                                                  OutUserDataType,
-                                                  DeviceGroupedConvNDFwdInstance<3,
-                                                                                 InLayout,
-                                                                                 WeiLayout,
-                                                                                 BiasLayout,
-                                                                                 ResidualLayout,
-                                                                                 OutLayout>>(
-            do_verification,
-            init_method,
-            time_kernel,
-            conv_param,
-            in_g_n_c_wis_desc,
-            wei_g_k_c_xs_desc,
-            bias_g_n_k_wos_desc,
-            residual_g_n_k_wos_desc,
-            out_g_n_k_wos_desc,
-            in_element_op,
-            wei_element_op,
-            out_element_op);
    }

-    return 0;
+    throw std::runtime_error("unsuppored # dim spatial");
 }
--- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include "grouped_convnd_fwd_bias_relu_add_common.hpp"
-
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
-
-#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
-
-// kernel data types
-using InKernelDataType       = float;
-using WeiKernelDataType      = float;
-using AccDataType            = float;
-using CShuffleDataType       = float;
-using BiasKernelDataType     = float;
-using ResidualKernelDataType = float;
-using OutKernelDataType      = float;
-
-// tensor data types
-using InUserDataType  = InKernelDataType;
-using WeiUserDataType = WeiKernelDataType;
-using OutUserDataType = OutKernelDataType;
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <iostream>
+#include <string>
+#include <type_traits>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+
+using BF16 = ck::bhalf_t;
+using FP16 = ck::half_t;
+using FP32 = float;
+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+using I4 = ck::int4_t;
+#endif
+using I8  = std::int8_t;
+using I32 = std::int32_t;

 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

-using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
-using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;

 static constexpr auto ConvSpec =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

 static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

-template <ck::index_t NDimSpatial,
-          typename InLayout,
-          typename WeiLayout,
-          typename BiasLayout,
-          typename ResidualLayout,
-          typename OutLayout>
-using DeviceGroupedConvNDFwdInstance =
-    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
-        NDimSpatial,
-        InLayout,
-        WeiLayout,
-        ck::Tuple<BiasLayout, ResidualLayout>,
-        OutLayout,
-        InKernelDataType,
-        WeiKernelDataType,
-        AccDataType,
-        CShuffleDataType,
-        ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
-        OutKernelDataType,
-        InElementOp,
-        WeiElementOp,
-        OutElementOp,
-        ConvSpec,    // ConvForwardSpecialization
-        GemmSpec,    // GemmSpecialization
-        1,           //
-        256,         // BlockSize
-        128,         // MPerBlock
-        256,         // NPerBlock
-        16,          // KPerBlock
-        4,           // AK1
-        4,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
-        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
-        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
-        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
-        2,           // ABlockTransferSrcVectorDim
-        4,           // ABlockTransferSrcScalarPerVector
-        4,           // ABlockTransferDstScalarPerVector_AK1
-        1,           // ABlockLdsExtraM
-        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
-        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
-        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
-        2,           // BBlockTransferSrcVectorDim
-        4,           // BBlockTransferSrcScalarPerVector
-        4,           // BBlockTransferDstScalarPerVector_BK1
-        1,           // BBlockLdsExtraN
-        1,
-        1,
-        S<1, 16, 1, 16>,
-        4>;
-
-int main(int argc, char* argv[])
+template <typename InputLay, typename WeightLay, typename OutputLay>
+struct CommonLayoutSetting
+{
+    using InputLayout  = InputLay;
+    using WeightLayout = WeightLay;
+    using OutputLayout = OutputLay;
+};
+
+template <ck::index_t NDimSpatial>
+struct CommonLayoutSettingSelector;
+
+namespace ctl = ck::tensor_layout::convolution;
+
+template <>
+struct CommonLayoutSettingSelector<1> final
+    : CommonLayoutSetting<ctl::G_NW_C, ctl::G_K_X_C, ctl::G_NW_K>
 {
-    namespace ctc = ck::tensor_layout::convolution;
+};

-    print_helper_msg();
+template <>
+struct CommonLayoutSettingSelector<2> final
+    : CommonLayoutSetting<ctl::G_NHW_C, ctl::G_K_YX_C, ctl::G_NHW_K>
+{
+};

+template <>
+struct CommonLayoutSettingSelector<3> final
+    : CommonLayoutSetting<ctl::G_NDHW_C, ctl::G_K_ZYX_C, ctl::G_NDHW_K>
+{
+};
+
+template <ck::index_t NDimSpatial>
+using InputLayout = typename CommonLayoutSettingSelector<NDimSpatial>::InputLayout;
+
+template <ck::index_t NDimSpatial>
+using WeightLayout = typename CommonLayoutSettingSelector<NDimSpatial>::WeightLayout;
+
+template <ck::index_t NDimSpatial>
+using OutputLayout = typename CommonLayoutSettingSelector<NDimSpatial>::OutputLayout;
+
+struct ExecutionConfig final
+{
    bool do_verification = true;
    int init_method      = 1;
-    bool time_kernel     = false;
-
-    // conventional group conv definition
-    // G = 2
-    // [N, C, Hi, Wi] =  [128, 384, 71, 71]
-    // [K, C,  Y,  X] =  [512, 192,  3,  3]
-    // [N, K, Ho, Wo] =  [128, 512, 36, 36]
-    // CK group conv definition
-    // [G, N, C, Hi, Wi] =  [2, 128, 192, 71, 71]
-    // [G, K, C,  Y,  X] =  [2, 256, 192,  3,  3]
-    // [G, N, K, Ho, Wo] =  [2, 128, 256, 36, 36]
-    ck::utils::conv::ConvParam conv_param{
-        2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
+    bool time_kernel     = true;
+};
+
+#define DefaultConvParam                                                       \
+    ck::utils::conv::ConvParam                                                 \
+    {                                                                          \
+        2, 32, 2, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, { 1, 1 } \
+    }
+
+inline void print_help_msg()
+{
+    std::cerr << "arg1: verification (0=no, 1=yes)\n"
+              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
+              << "arg3: time kernel (0=no, 1=yes)\n"
+              << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
+}
+
+inline bool parse_cmd_args(int argc,
+                           char* argv[],
+                           ExecutionConfig& config,
+                           ck::utils::conv::ConvParam& conv_param)
+{
+    constexpr int num_execution_config_args =
+        3; // arguments for do_verification, init_method, time_kernel
+    constexpr int num_conv_param_leading_args = 5; // arguments for num_dim_spatial_, G_, N_, K_, C_
+
+    constexpr int threshold_to_catch_partial_args = 1 + num_execution_config_args;
+    constexpr int threshold_to_catch_all_args =
+        threshold_to_catch_partial_args + num_conv_param_leading_args;

    if(argc == 1)
    {
        // use default
    }
-    else if(argc == 4)
+    // catch only ExecutionConfig arguments
+    else if(argc == threshold_to_catch_partial_args)
    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        time_kernel     = std::stoi(argv[3]);
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
    }
-    else
+    // catch both ExecutionConfig & ConvParam arguments
+    else if(threshold_to_catch_all_args < argc && ((argc - threshold_to_catch_all_args) % 3 == 0))
    {
-        do_verification                   = std::stoi(argv[1]);
-        init_method                       = std::stoi(argv[2]);
-        time_kernel                       = std::stoi(argv[3]);
-        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);

-        conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
+        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
+        conv_param                        = ck::utils::conv::parse_conv_param(
+            num_dim_spatial, threshold_to_catch_partial_args + 1, argv);
+    }
+    else
+    {
+        print_help_msg();
+        return false;
    }

-    const auto in_element_op  = InElementOp{};
-    const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{};
+    return true;
+}

-    if(conv_param.num_dim_spatial_ == 1)
+inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvParam& conv_param)
+{
+    switch(conv_param.num_dim_spatial_)
    {
-        using InLayout       = ctc::G_NW_C;
-        using WeiLayout      = ctc::G_K_X_C;
-        using BiasLayout     = ctc::G_K;
-        using ResidualLayout = ctc::G_NW_K;
-        using OutLayout      = ctc::G_NW_K;
-
-        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
+    case 1:
+        return HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
            {
                conv_param.C_,                                                        // g
@@ -150,81 +162,8 @@ int main(int argc, char* argv[])
                conv_param.G_ * conv_param.C_                                         // wi
            });

-        const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
-            {conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
-            {
-                conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
-                conv_param.filter_spatial_lengths_[0] * conv_param.C_,                 // k
-                1,                                                                     // c
-                conv_param.C_                                                          // x
-            });
-
-        const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
-            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
-            {
-                conv_param.K_, // g
-                0,             // k
-                1,             // c
-                0              // x
-            });
-
-        const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
-            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
-            {
-                conv_param.K_, // g
-                0,             // k
-                1,             // c
-                0              // x
-            });
-
-        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
-            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
-            {
-                conv_param.K_,                                                         // g
-                conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
-                1,                                                                     // k
-                conv_param.G_ * conv_param.K_                                          // wo
-            });
-
-        return run_grouped_conv_fwd_bias_relu_add<1,
-                                                  InKernelDataType,
-                                                  WeiKernelDataType,
-                                                  CShuffleDataType,
-                                                  OutKernelDataType,
-                                                  InElementOp,
-                                                  WeiElementOp,
-                                                  OutElementOp,
-                                                  InUserDataType,
-                                                  WeiUserDataType,
-                                                  OutUserDataType,
-                                                  DeviceGroupedConvNDFwdInstance<1,
-                                                                                 InLayout,
-                                                                                 WeiLayout,
-                                                                                 BiasLayout,
-                                                                                 ResidualLayout,
-                                                                                 OutLayout>>(
-            do_verification,
-            init_method,
-            time_kernel,
-            conv_param,
-            in_g_n_c_wis_desc,
-            wei_g_k_c_xs_desc,
-            bias_g_n_k_wos_desc,
-            residual_g_n_k_wos_desc,
-            out_g_n_k_wos_desc,
-            in_element_op,
-            wei_element_op,
-            out_element_op);
-    }
-    else if(conv_param.num_dim_spatial_ == 2)
-    {
-        using InLayout       = ctc::G_NHW_C;
-        using WeiLayout      = ctc::G_K_YX_C;
-        using BiasLayout     = ctc::G_K;
-        using ResidualLayout = ctc::G_NHW_K;
-        using OutLayout      = ctc::G_NHW_K;
-
-        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
+    case 2:
+        return HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.C_,
@@ -239,104 +178,8 @@ int main(int argc, char* argv[])
                conv_param.G_ * conv_param.C_                                         // wi
            });

-        const auto wei_g_k_c_xs_desc =
-            HostTensorDescriptor({conv_param.G_,
-                                  conv_param.K_,
-                                  conv_param.C_,
-                                  conv_param.filter_spatial_lengths_[0],
-                                  conv_param.filter_spatial_lengths_[1]},
-                                 {
-                                     conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
-                                         conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
-                                     conv_param.filter_spatial_lengths_[0] *
-                                         conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
-                                     1,                                                         // c
-                                     conv_param.filter_spatial_lengths_[1] * conv_param.C_,     // y
-                                     conv_param.C_                                              // x
-                                 });
-
-        const auto bias_g_n_k_wos_desc =
-            HostTensorDescriptor({conv_param.G_,
-                                  conv_param.N_,
-                                  conv_param.K_,
-                                  conv_param.output_spatial_lengths_[0],
-                                  conv_param.output_spatial_lengths_[1]},
-                                 {
-                                     conv_param.K_, // g
-                                     0,             // n
-                                     1,             // k
-                                     0,             // ho
-                                     0              // wo
-                                 });
-
-        const auto residual_g_n_k_wos_desc =
-            HostTensorDescriptor({conv_param.G_,
-                                  conv_param.N_,
-                                  conv_param.K_,
-                                  conv_param.output_spatial_lengths_[0],
-                                  conv_param.output_spatial_lengths_[1]},
-                                 {
-                                     conv_param.K_, // g
-                                     0,             // n
-                                     1,             // k
-                                     0,             // ho
-                                     0              // wo
-                                 });
-
-        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
-            {conv_param.G_,
-             conv_param.N_,
-             conv_param.K_,
-             conv_param.output_spatial_lengths_[0],
-             conv_param.output_spatial_lengths_[1]},
-            {
-                conv_param.K_, // g
-                conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
-                    conv_param.G_ * conv_param.K_,                                     // n
-                1,                                                                     // k
-                conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
-                conv_param.G_ * conv_param.K_                                          // wo
-            });
-
-        return run_grouped_conv_fwd_bias_relu_add<2,
-                                                  InKernelDataType,
-                                                  WeiKernelDataType,
-                                                  CShuffleDataType,
-                                                  OutKernelDataType,
-                                                  InElementOp,
-                                                  WeiElementOp,
-                                                  OutElementOp,
-                                                  InUserDataType,
-                                                  WeiUserDataType,
-                                                  OutUserDataType,
-                                                  DeviceGroupedConvNDFwdInstance<2,
-                                                                                 InLayout,
-                                                                                 WeiLayout,
-                                                                                 BiasLayout,
-                                                                                 ResidualLayout,
-                                                                                 OutLayout>>(
-            do_verification,
-            init_method,
-            time_kernel,
-            conv_param,
-            in_g_n_c_wis_desc,
-            wei_g_k_c_xs_desc,
-            bias_g_n_k_wos_desc,
-            residual_g_n_k_wos_desc,
-            out_g_n_k_wos_desc,
-            in_element_op,
-            wei_element_op,
-            out_element_op);
-    }
-    else if(conv_param.num_dim_spatial_ == 3)
-    {
-        using InLayout       = ctc::G_NDHW_C;
-        using WeiLayout      = ctc::G_K_ZYX_C;
-        using BiasLayout     = ctc::G_K;
-        using ResidualLayout = ctc::G_NDHW_K;
-        using OutLayout      = ctc::G_NDHW_K;
-
-        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
+    case 3:
+        return HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.C_,
@@ -353,8 +196,42 @@ int main(int argc, char* argv[])
                conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
                conv_param.G_ * conv_param.C_                                         // wi
            });
+    }

-        const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
+    throw std::runtime_error("unsuppored # dim spatial");
+}
+
+inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvParam& conv_param)
+{
+    switch(conv_param.num_dim_spatial_)
+    {
+    case 1:
+        return HostTensorDescriptor(
+            {conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
+            {
+                conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
+                conv_param.filter_spatial_lengths_[0] * conv_param.C_,                 // k
+                1,                                                                     // c
+                conv_param.C_                                                          // x
+            });
+    case 2:
+        return HostTensorDescriptor(
+            {conv_param.G_,
+             conv_param.K_,
+             conv_param.C_,
+             conv_param.filter_spatial_lengths_[0],
+             conv_param.filter_spatial_lengths_[1]},
+            {
+                conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
+                    conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
+                conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
+                    conv_param.C_,                                     // k
+                1,                                                     // c
+                conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
+                conv_param.C_                                          // x
+            });
+    case 3:
+        return HostTensorDescriptor(
            {conv_param.G_,
             conv_param.K_,
             conv_param.C_,
@@ -373,40 +250,89 @@ int main(int argc, char* argv[])
                conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
                conv_param.C_                                          // x
            });
+    }

-        const auto bias_g_n_k_wos_desc =
-            HostTensorDescriptor({conv_param.G_,
-                                  conv_param.N_,
-                                  conv_param.K_,
-                                  conv_param.output_spatial_lengths_[0],
-                                  conv_param.output_spatial_lengths_[1],
-                                  conv_param.output_spatial_lengths_[2]},
-                                 {
-                                     conv_param.K_, // g
-                                     0,             // n
-                                     1,             // k
-                                     0,             // z
-                                     0,             // y
-                                     0              // x
-                                 });
-
-        const auto residual_g_n_k_wos_desc =
-            HostTensorDescriptor({conv_param.G_,
-                                  conv_param.N_,
-                                  conv_param.K_,
-                                  conv_param.output_spatial_lengths_[0],
-                                  conv_param.output_spatial_lengths_[1],
-                                  conv_param.output_spatial_lengths_[2]},
-                                 {
-                                     conv_param.K_, // g
-                                     0,             // n
-                                     1,             // k
-                                     0,             // z
-                                     0,             // y
-                                     0              // x
-                                 });
-
-        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
+    throw std::runtime_error("unsuppored # dim spatial");
+}
+
+inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvParam& conv_param)
+{
+    switch(conv_param.num_dim_spatial_)
+    {
+    case 1:
+        return HostTensorDescriptor(
+            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
+            {
+                conv_param.K_, // g
+                0,             // k
+                1,             // c
+                0              // x
+            });
+    case 2:
+        return HostTensorDescriptor({conv_param.G_,
+                                     conv_param.N_,
+                                     conv_param.K_,
+                                     conv_param.output_spatial_lengths_[0],
+                                     conv_param.output_spatial_lengths_[1]},
+                                    {
+                                        conv_param.K_, // g
+                                        0,             // n
+                                        1,             // k
+                                        0,             // ho
+                                        0              // wo
+                                    });
+    case 3:
+        return HostTensorDescriptor({conv_param.G_,
+                                     conv_param.N_,
+                                     conv_param.K_,
+                                     conv_param.output_spatial_lengths_[0],
+                                     conv_param.output_spatial_lengths_[1],
+                                     conv_param.output_spatial_lengths_[2]},
+                                    {
+                                        conv_param.K_, // g
+                                        0,             // n
+                                        1,             // k
+                                        0,             // z
+                                        0,             // y
+                                        0              // x
+                                    });
+    }
+
+    throw std::runtime_error("unsuppored # dim spatial");
+}
+
+inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvParam& conv_param)
+{
+
+    switch(conv_param.num_dim_spatial_)
+    {
+    case 1:
+        return HostTensorDescriptor(
+            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
+            {
+                conv_param.K_,                                                         // g
+                conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
+                1,                                                                     // k
+                conv_param.G_ * conv_param.K_                                          // wo
+            });
+    case 2:
+        return HostTensorDescriptor(
+            {conv_param.G_,
+             conv_param.N_,
+             conv_param.K_,
+             conv_param.output_spatial_lengths_[0],
+             conv_param.output_spatial_lengths_[1]},
+            {
+                conv_param.K_, // g
+                conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
+                    conv_param.G_ * conv_param.K_,                                     // n
+                1,                                                                     // k
+                conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
+                conv_param.G_ * conv_param.K_                                          // wo
+            });
+
+    case 3:
+        return HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.K_,
@@ -423,37 +349,7 @@ int main(int argc, char* argv[])
                conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
                conv_param.G_ * conv_param.K_                                          // wo
            });
-
-        return run_grouped_conv_fwd_bias_relu_add<3,
-                                                  InKernelDataType,
-                                                  WeiKernelDataType,
-                                                  CShuffleDataType,
-                                                  OutKernelDataType,
-                                                  InElementOp,
-                                                  WeiElementOp,
-                                                  OutElementOp,
-                                                  InUserDataType,
-                                                  WeiUserDataType,
-                                                  OutUserDataType,
-                                                  DeviceGroupedConvNDFwdInstance<3,
-                                                                                 InLayout,
-                                                                                 WeiLayout,
-                                                                                 BiasLayout,
-                                                                                 ResidualLayout,
-                                                                                 OutLayout>>(
-            do_verification,
-            init_method,
-            time_kernel,
-            conv_param,
-            in_g_n_c_wis_desc,
-            wei_g_k_c_xs_desc,
-            bias_g_n_k_wos_desc,
-            residual_g_n_k_wos_desc,
-            out_g_n_k_wos_desc,
-            in_element_op,
-            wei_element_op,
-            out_element_op);
    }

-    return 0;
+    throw std::runtime_error("unsuppored # dim spatial");
 }
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common_wmma.hpp"
+
+// kernel data types
+using InKernelDataType       = FP16;
+using WeiKernelDataType      = FP16;
+using AccDataType            = FP32;
+using CShuffleDataType       = FP16;
+using BiasKernelDataType     = FP16;
+using ResidualKernelDataType = FP16;
+using OutKernelDataType      = FP16;
+
+// tensor data types
+using InUserDataType  = InKernelDataType;
+using WeiUserDataType = WeiKernelDataType;
+using OutUserDataType = OutKernelDataType;
+
+using InElementOp  = PassThrough;
+using WeiElementOp = PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
+
+#include "run_grouped_conv_fwd_bias_relu_add_wmma_example.inc"
+
+int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+// kernel data types
+using InKernelDataType       = BF16;
+using WeiKernelDataType      = BF16;
+using AccDataType            = FP32;
+using CShuffleDataType       = FP32;
+using BiasKernelDataType     = BF16;
+using ResidualKernelDataType = BF16;
+using OutKernelDataType      = BF16;
+
+// tensor data types
+using InUserDataType  = InKernelDataType;
+using WeiUserDataType = WeiKernelDataType;
+using OutUserDataType = OutKernelDataType;
+
+using InElementOp  = PassThrough;
+using WeiElementOp = PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
+
+#include "run_grouped_conv_fwd_bias_relu_add_example.inc"
+
+int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+// kernel data types
+using InKernelDataType       = FP16;
+using WeiKernelDataType      = FP16;
+using AccDataType            = FP32;
+using CShuffleDataType       = FP16;
+using BiasKernelDataType     = FP16;
+using ResidualKernelDataType = FP16;
+using OutKernelDataType      = FP16;
+
+// tensor data types
+using InUserDataType  = InKernelDataType;
+using WeiUserDataType = WeiKernelDataType;
+using OutUserDataType = OutKernelDataType;
+
+using InElementOp  = PassThrough;
+using WeiElementOp = PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
+
+#include "run_grouped_conv_fwd_bias_relu_add_example.inc"
+
+int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }