Merge branch 'develop' into sphinx_doc

e7be2fe8 · pmaybank · GitHub · f68fa79a · f7d28f3e · e7be2fe8
Unverified Commit e7be2fe8 authored Feb 10, 2023 by pmaybank Committed by GitHub Feb 10, 2023
20 changed files
--- a/example/20_convnd_bwd_weight/convnd_bwd_weight_common.hpp
+++ b/example/20_convnd_bwd_weight/convnd_bwd_weight_common.hpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include <iostream>
+template <ck::index_t NDimSpatial>
-#include <numeric>
+using DeviceConvBwdWeightInstance =
-#include <initializer_list>
+    ck::tensor_operation::device::DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<
-#include <cstdlib>
+        NDimSpatial,          // NDimSpatial
+        InDataType,           // InDataType
-#include "ck/ck.hpp"
+        WeiDataType,          // WeiDataType
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+        OutDataType,          // OutDataType
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+        AccDataType,          // AccDataType
+        InElementOp,          // InElementwiseOperation
-#include "ck/library/utility/check_err.hpp"
+        WeiElementOp,         // WeiElementwiseOperation
-#include "ck/library/utility/device_memory.hpp"
+        OutElementOp,         // OutElementwiseOperation
-#include "ck/library/utility/host_tensor.hpp"
+        ConvBwdWeightDefault, // ConvolutionBackwardWeightSpecialization
-#include "ck/library/utility/host_tensor_generator.hpp"
+        256,                  // BlockSize
-#include "ck/library/utility/convolution_parameter.hpp"
+        128,                  // MPerBlock
-#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+        128,                  // NPerBlock
-#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp"
+        4,                    // K0PerBlock
+        8,                    // K1
-void print_helper_msg()
+        32,                   // MPerXdl
+        32,                   // NPerXdl
+        2,                    // MXdlPerWave
+        2,                    // NXdlPerWave
+        S<1, 4, 16, 4>,       // ABlockTransferThreadClusterLengths_K0_M_K1
+        S<0, 3, 1, 2>,        // ABlockTransferThreadClusterArrangeOrder
+        S<0, 2, 1, 3>,        // ABlockTransferSrcAccessOrder
+        2,                    // ABlockTransferSrcVectorDim
+        8,                    // ABlockTransferSrcScalarPerVector
+        2,                    // ABlockTransferDstScalarPerVector_K1
+        true,                 // ABlockLdsAddExtraM
+        S<1, 4, 16, 4>,       // BBlockTransferThreadClusterLengths_K0_N_K1
+        S<0, 3, 1, 2>,        // BBlockTransferThreadClusterArrangeOrder
+        S<0, 2, 1, 3>,        // BBlockTransferSrcAccessOrder
+        2,                    // BBlockTransferSrcVectorDim
+        8,                    // BBlockTransferSrcScalarPerVector
+        2,                    // BBlockTransferDstScalarPerVector_K1
+        true,                 // BBlockLdsAddExtraN
+        1,                    // CShuffleMXdlPerWavePerShuffle
+        1,                    // CShuffleNXdlPerWavePerShuffle
+        S<1, 32, 1, 4>,       // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+        128 / (sizeof(WeiDataType) * CHAR_BIT)>; // CBlockTransferScalarPerVector_NWaveNPerXdl
+template <ck::index_t NDimSpatial>
+using HostConvBwdWeightInstance = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
+                                                                                     InDataType,
+                                                                                     WeiDataType,
+                                                                                     OutDataType,
+                                                                                     InElementOp,
+                                                                                     WeiElementOp,
+                                                                                     OutElementOp>;
+template <ck::index_t NDimSpatial>
+bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
+                                 const ck::utils::conv::ConvParam& conv_param)
 {
-    std::cout << "arg1: verification (0=no, 1=yes)\n"
+    constexpr ck::index_t split_k = 2;
-              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
-              << "arg3: time kernel (0=no, 1=yes)\n"
+    const auto in_g_n_c_wis_desc =
-              << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
+        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<
-}
+            InputLayout<NDimSpatial>>(conv_param);
+    const auto wei_g_k_c_xs_desc =
+        ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<
+            WeightLayout<NDimSpatial>>(conv_param);
+    const auto out_g_n_k_wos_desc =
+        ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<
+            OutputLayout<NDimSpatial>>(conv_param);
-template <ck::index_t NDimSpatial,
-          typename InDataType,
-          typename WeiDataType,
-          typename OutDataType,
-          typename InElementOp,
-          typename WeiElementOp,
-          typename OutElementOp,
-          typename DeviceConvBwdWeightInstance>
-int run_conv_bwd_weight(bool do_verification,
-                        int init_method,
-                        bool time_kernel,
-                        const ck::utils::conv::ConvParam& conv_param,
-                        const HostTensorDescriptor& in_g_n_c_wis_desc,
-                        const HostTensorDescriptor& wei_g_k_c_xs_desc,
-                        const HostTensorDescriptor& out_g_n_k_wos_desc,
-                        const InElementOp& in_element_op,
-                        const WeiElementOp& wei_element_op,
-                        const OutElementOp& out_element_op,
-                        ck::index_t split_k)
-{
    Tensor<InDataType> in(in_g_n_c_wis_desc);
    Tensor<WeiDataType> wei_host_result(wei_g_k_c_xs_desc);
    Tensor<WeiDataType> wei_device_result(wei_g_k_c_xs_desc);
@@ -55,7 +77,7 @@ int run_conv_bwd_weight(bool do_verification,
    std::cout << "wei: " << wei_host_result.mDesc << std::endl;
    std::cout << "out: " << out.mDesc << std::endl;
-    switch(init_method)
+    switch(config.init_method)
    {
    case 0: break;
    case 1:
@@ -77,36 +99,55 @@ int run_conv_bwd_weight(bool do_verification,
    // init to 0
    wei_device_buf.SetZero();
+    std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+    auto range_copy = [](const auto& from, auto to) { std::copy(begin(from), end(from), to); };
+    range_copy(conv_param.input_spatial_lengths_, begin(input_spatial_lengths));
+    range_copy(conv_param.filter_spatial_lengths_, begin(filter_spatial_lengths));
+    range_copy(conv_param.output_spatial_lengths_, begin(output_spatial_lengths));
+    range_copy(conv_param.conv_filter_strides_, begin(conv_filter_strides));
+    range_copy(conv_param.conv_filter_dilations_, begin(conv_filter_dilations));
+    range_copy(conv_param.input_left_pads_, begin(input_left_pads));
+    range_copy(conv_param.input_right_pads_, begin(input_right_pads));
    // do GEMM
-    auto conv     = DeviceConvBwdWeightInstance{};
+    auto conv     = DeviceConvBwdWeightInstance<NDimSpatial>{};
    auto invoker  = conv.MakeInvoker();
    auto argument = conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                      static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                      static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+                                      conv_param.G_,
                                      conv_param.N_,
                                      conv_param.K_,
                                      conv_param.C_,
-                                      conv_param.input_spatial_lengths_,
+                                      input_spatial_lengths,
-                                      conv_param.filter_spatial_lengths_,
+                                      filter_spatial_lengths,
-                                      conv_param.output_spatial_lengths_,
+                                      output_spatial_lengths,
-                                      conv_param.conv_filter_strides_,
+                                      conv_filter_strides,
-                                      conv_param.conv_filter_dilations_,
+                                      conv_filter_dilations,
-                                      conv_param.input_left_pads_,
+                                      input_left_pads,
-                                      conv_param.input_right_pads_,
+                                      input_right_pads,
-                                      in_element_op,
+                                      InElementOp{},
-                                      wei_element_op,
+                                      WeiElementOp{},
-                                      out_element_op,
+                                      OutElementOp{},
                                      split_k);
    if(!conv.IsSupportedArgument(argument))
    {
-        std::cout << "wrong! device_conv with the specified compilation parameters does "
+        std::cerr << "wrong! device_conv with the specified compilation parameters does "
                     "not support this Conv problem"
                  << std::endl;
-        return 1;
+        return false;
    }
-    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+    float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
    std::size_t flop      = conv_param.GetFlops();
    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
@@ -115,21 +156,14 @@ int run_conv_bwd_weight(bool do_verification,
    float gb_per_sec = num_btype / 1.E6 / avg_time;
-    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+    std::cerr << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
-              << conv.GetTypeString() << std::endl;
+              << std::endl
+              << "DeviceOp: " << conv.GetTypeString() << std::endl;
-    if(do_verification)
+    if(config.do_verification)
    {
-        auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
+        auto ref_conv     = HostConvBwdWeightInstance<NDimSpatial>{};
-                                                                           InDataType,
+        auto ref_invoker  = ref_conv.MakeInvoker();
-                                                                           WeiDataType,
-                                                                           OutDataType,
-                                                                           InElementOp,
-                                                                           WeiElementOp,
-                                                                           OutElementOp>{};
-        auto ref_invoker = ref_conv.MakeInvoker();
        auto ref_argument = ref_conv.MakeArgument(in,
                                                  wei_host_result,
                                                  out,
@@ -145,8 +179,28 @@ int run_conv_bwd_weight(bool do_verification,
        wei_device_buf.FromDevice(wei_device_result.mData.data());
-        return ck::utils::check_err(wei_device_result.mData, wei_host_result.mData) ? 0 : 1;
+        return ck::utils::check_err(wei_device_result.mData, wei_host_result.mData);
+    }
+    return true;
+}
+bool run_grouped_conv_bwd_weight_example(int argc, char* argv[])
+{
+    ExecutionConfig config;
+    ck::utils::conv::ConvParam conv_param = DefaultConvParam;
+    if(!parse_cmd_args(argc, argv, config, conv_param))
+    {
+        return false;
+    }
+    switch(conv_param.num_dim_spatial_)
+    {
+    case 1: return run_grouped_conv_bwd_weight<1>(config, conv_param);
+    case 2: return run_grouped_conv_bwd_weight<2>(config, conv_param);
+    case 3: return run_grouped_conv_bwd_weight<3>(config, conv_param);
    }
-    return 0;
+    return false;
 }
--- a/example/21_gemm_layernorm/CMakeLists.txt
+++ b/example/21_gemm_layernorm/CMakeLists.txt
-add_example_executable(example_gemm_bias_relu_add_layernorm_xdl_fp16 gemm_bias_relu_add_layernorm_xdl_fp16.cpp)
+add_example_executable(example_gemm_bias_relu_add_layernorm_xdl_welford_fp16 gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp)
-add_example_executable(example_gemm_layernorm_xdl_fp16 gemm_layernorm_xdl_fp16.cpp)
+add_example_executable(example_gemm_bias_relu_add_layernorm_xdl_naive_fp16 gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp)
-add_example_executable(example_gemm_xdl_layernorm_single_kernel_fp16 gemm_xdl_layernorm_single_kernel_fp16.cpp)
+add_example_executable(example_gemm_layernorm_xdl_naive_fp16 gemm_layernorm_xdl_naive_fp16.cpp)
+add_example_executable(example_gemm_xdl_layernorm_naive_single_kernel_fp16 gemm_xdl_layernorm_naive_single_kernel_fp16.cpp)
--- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp
@@ -4,18 +4,18 @@
 #include <iostream>
 #include <numeric>
 #include <initializer_list>
-#include <cstdlib>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -94,7 +94,7 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataTyp
 using NormalizeFunctor = ck::tensor_operation::element_wise::Normalize;
 // A:x, B:E[x], C:E[x^2], D:Gamma, E:Beta , F:y
-using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise<
+using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
    ck::Tuple<EDataType,
              R0DataType,
              R1DataType,
@@ -108,21 +108,20 @@ using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise<
    ck::Sequence<8>>;            // scalarPerVector: y(layerNorm_out)
 auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
-    return HostTensorDescriptor(std::vector<std::size_t>({len}),
+    return HostTensorDescriptor({len}, {stride});
-                                std::vector<std::size_t>({stride}));
 };
 auto f_host_tensor_descriptor2d =
    [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+        using namespace ck::literals;
+        if constexpr(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
        {
-            return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+            return HostTensorDescriptor({row, col}, {stride, 1_uz});
-                                        std::vector<std::size_t>({stride, 1}));
        }
        else
        {
-            return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+            return HostTensorDescriptor({row, col}, {1_uz, stride});
-                                        std::vector<std::size_t>({1, stride}));
        }
    };
@@ -372,8 +371,8 @@ int main()
                            N);
        layerNorm_device_buf.FromDevice(layerNorm_m_n.mData.data());
-        pass &= ck::utils::check_err(layerNorm_m_n.mData,
+        pass &= ck::utils::check_err(layerNorm_m_n,
-                                     host_layerNorm_m_n.mData,
+                                     host_layerNorm_m_n,
                                     "Error: Incorrect results layerNorm_m_n",
                                     1e-2,
                                     1e-2);

--- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
+#include "ck/library/utility/check_err.hpp"
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using F16 = ck::half_t;
+using F32 = float;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AddReluAdd  = ck::tensor_operation::element_wise::AddReluAdd;
+// DataType
+using ADataType        = F16;
+using BDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using D0DataType       = F16;
+using D1DataType       = F16;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
+using EMeanVarDataType = F16;
+using GammaDataType    = F16;
+using BetaDataType     = F16;
+using HDataType        = F16;
+// Layout
+using ALayout  = Row;
+using BLayout  = Col;
+using D0Layout = Row;
+using D1Layout = Row;
+using DsLayout = ck::Tuple<D0Layout, D1Layout>;
+using HLayout  = Row;
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = AddReluAdd;
+using HElementOp   = PassThrough;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+// clang-format off
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Xdl_CShuffle
+//######| ALayout| BLayout| DsLayout| HLayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EMeanVarData|     GammaData|     BetaData|     HData|           A|           B|          CDE|            H|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|           PostShuffle|     PostShuffle|            Layernorm|       Layernorm|
+//######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|             Type|          Type|         Type|      Type| Elementwise| Elementwise|  Elementwise|  Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|  ThreadClusterLengths| ScalarPerVector| ThreadClusterLengths| ThreadSliceSize|
+//######|        |        |         |        |          |          |            |                 |           |                 |              |             |          |   Operation|   Operation|    Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                  _M_N|            _M_N|                 _M_N|              _M|
+//######|        |        |         |        |          |          |            |                 |           |                 |              |             |          |            |            |             |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                      |                |                     |                |
+        < ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EMeanVarDataType, GammaDataType, BetaDataType, HDataType,  AElementOp,  BElementOp, CDEElementOp,   HElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,              S<32, 8>,               8,             S<8, 32>,               8>;
+// clang-format on
+auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
+    return HostTensorDescriptor({len}, {stride});
+};
+auto f_host_tensor_descriptor2d =
+    [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+        using namespace ck::literals;
+        if constexpr(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+        {
+            return HostTensorDescriptor({row, col}, {stride, 1_uz});
+        }
+        else
+        {
+            return HostTensorDescriptor({row, col}, {1_uz, stride});
+        }
+    };
+void host_gemm_layernorm(Tensor<HDataType>& h_m_n,
+                         const Tensor<ADataType>& a_m_k,
+                         const Tensor<BDataType>& b_k_n,
+                         const Tensor<D0DataType>& bias_n,
+                         const Tensor<D1DataType>& d1_m_n,
+                         const Tensor<GammaDataType>& gamma_n,
+                         const Tensor<BetaDataType>& beta_n,
+                         AElementOp a_element_op,
+                         BElementOp b_element_op,
+                         CDEElementOp cde_element_op,
+                         HElementOp h_element_op,
+                         int M,
+                         int N,
+                         AccDataType epsilon = 1e-5)
+{
+    using ReferenceGemm = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                    BDataType,
+                                                                    AccDataType,
+                                                                    AccDataType,
+                                                                    AElementOp,
+                                                                    BElementOp,
+                                                                    PassThrough>;
+    using ReferenceLayernorm = ck::tensor_operation::host::ReferenceLayernorm<EMeanVarDataType,
+                                                                              GammaDataType,
+                                                                              BetaDataType,
+                                                                              HDataType,
+                                                                              AccDataType,
+                                                                              HElementOp,
+                                                                              2,
+                                                                              1>;
+    Tensor<EMeanVarDataType> e_m_n(HostTensorDescriptor{M, N});
+    Tensor<AccDataType> c_m_n(HostTensorDescriptor{M, N});
+    auto ref_gemm         = ReferenceGemm{};
+    auto ref_gemm_invoker = ref_gemm.MakeInvoker();
+    auto ref_gemm_argument =
+        ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
+    ref_gemm_invoker.Run(ref_gemm_argument);
+    for(int n = 0; n < N; ++n)
+    {
+        AccDataType bias = static_cast<AccDataType>(bias_n(n));
+        for(int m = 0; m < M; ++m)
+        {
+            AccDataType e  = static_cast<AccDataType>(e_m_n(m, n));
+            AccDataType d1 = static_cast<AccDataType>(d1_m_n(m, n));
+            cde_element_op(e, c_m_n(m, n), bias, d1);
+            e_m_n(m, n) = static_cast<EMeanVarDataType>(e);
+        }
+    }
+    ReferenceLayernorm ref_layernorm;
+    auto ref_layernorm_invoker = ref_layernorm.MakeInvoker();
+    auto ref_layernorm_argument = ref_layernorm.MakeArgument(
+        e_m_n, gamma_n, beta_n, h_m_n, h_element_op, {M, N}, {1}, epsilon);
+    ref_layernorm_invoker.Run(ref_layernorm_argument);
+}
+int main()
+{
+    bool do_verification = true;
+    // GEMM shape
+    ck::index_t M = 1024;
+    ck::index_t N = 1024;
+    ck::index_t K = 1024;
+    ck::index_t StrideA  = K;
+    ck::index_t StrideB  = K;
+    ck::index_t StrideD0 = 0;
+    ck::index_t StrideD1 = N;
+    ck::index_t StrideH  = N;
+    float epsilon = 1e-5;
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
+    Tensor<D0DataType> d0_n(f_host_tensor_descriptor1d(N, 1));
+    Tensor<D1DataType> d1_m_n(f_host_tensor_descriptor2d(M, N, StrideD1, D1Layout{}));
+    Tensor<GammaDataType> gamma_n(f_host_tensor_descriptor1d(N, 1));
+    Tensor<BetaDataType> beta_n(f_host_tensor_descriptor1d(N, 1));
+    Tensor<HDataType> h_m_n(f_host_tensor_descriptor2d(M, N, StrideH, HLayout{}));
+    a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-1, 1});
+    b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-1, 1});
+    d0_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{-1, 1});
+    d1_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{-1, 1});
+    gamma_n.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{-1, 1});
+    beta_n.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{-1, 1});
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_n.mDesc.GetElementSpaceSize());
+    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem gamma_device_buf(sizeof(GammaDataType) * gamma_n.mDesc.GetElementSpaceSize());
+    DeviceMem beta_device_buf(sizeof(BetaDataType) * beta_n.mDesc.GetElementSpaceSize());
+    DeviceMem h_device_buf(sizeof(HDataType) * h_m_n.mDesc.GetElementSpaceSize());
+    a_device_buf.ToDevice(a_m_k.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+    d0_device_buf.ToDevice(d0_n.mData.data());
+    d1_device_buf.ToDevice(d1_m_n.mData.data());
+    gamma_device_buf.ToDevice(gamma_n.mData.data());
+    beta_device_buf.ToDevice(beta_n.mData.data());
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+    auto h_element_op   = HElementOp{};
+    auto device_op = DeviceOpInstance{};
+    auto invoker   = device_op.MakeInvoker();
+    auto argument =
+        device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                               b_device_buf.GetDeviceBuffer(),
+                               {d0_device_buf.GetDeviceBuffer(), d1_device_buf.GetDeviceBuffer()},
+                               gamma_device_buf.GetDeviceBuffer(),
+                               beta_device_buf.GetDeviceBuffer(),
+                               h_device_buf.GetDeviceBuffer(),
+                               M,
+                               N,
+                               K,
+                               StrideA,
+                               StrideB,
+                               {StrideD0, StrideD1},
+                               StrideH,
+                               epsilon,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op,
+                               h_element_op);
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error("wrong! this device_op instance does not support this problem");
+    }
+    size_t workspace_sz = device_op.GetWorkSpaceSize(&argument);
+    DeviceMem workspace_dev(workspace_sz);
+    device_op.SetWorkSpacePointer(&argument, workspace_dev.GetDeviceBuffer());
+    invoker.Run(argument, StreamConfig{nullptr, false});
+    bool pass = true;
+    if(do_verification)
+    {
+        Tensor<HDataType> h_m_n_host(HostTensorDescriptor{M, N});
+        host_gemm_layernorm(h_m_n_host,
+                            a_m_k,
+                            b_k_n,
+                            d0_n,
+                            d1_m_n,
+                            gamma_n,
+                            beta_n,
+                            a_element_op,
+                            b_element_op,
+                            cde_element_op,
+                            h_element_op,
+                            M,
+                            N,
+                            epsilon);
+        h_device_buf.FromDevice(h_m_n.mData.data());
+        pass &=
+            ck::utils::check_err(h_m_n, h_m_n_host, "Error: Incorrect results h_m_n", 1e-2, 1e-2);
+    }
+    return pass ? 0 : 1;
+}
--- a/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp
@@ -4,18 +4,18 @@
 #include <iostream>
 #include <numeric>
 #include <initializer_list>
-#include <cstdlib>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -91,7 +91,7 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataTyp
 using NormalizeFunctor = ck::tensor_operation::element_wise::Normalize;
 // A:x, B:E[x], C:E[x^2], D:Gamma, E:Beta , F:y
-using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise<
+using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
    ck::Tuple<EDataType,
              R0DataType,
              R1DataType,
@@ -107,21 +107,20 @@ using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise<
    ck::Sequence<8>>;            // scalarPerVector: y(layerNorm_out)
 auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
-    return HostTensorDescriptor(std::vector<std::size_t>({len}),
+    return HostTensorDescriptor({len}, {stride});
-                                std::vector<std::size_t>({stride}));
 };
 auto f_host_tensor_descriptor2d =
    [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+        using namespace ck::literals;
+        if constexpr(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
        {
-            return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+            return HostTensorDescriptor({row, col}, {stride, 1_uz});
-                                        std::vector<std::size_t>({stride, 1}));
        }
        else
        {
-            return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+            return HostTensorDescriptor({row, col}, {1_uz, stride});
-                                        std::vector<std::size_t>({1, stride}));
        }
    };
@@ -346,11 +345,8 @@ int main()
                            N);
        layerNorm_device_buf.FromDevice(layerNorm_m_n.mData.data());
-        pass &= ck::utils::check_err(layerNorm_m_n.mData,
+        pass &= ck::utils::check_err(
-                                     host_layerNorm_m_n.mData,
+            layerNorm_m_n, host_layerNorm_m_n, "Error: Incorrect results d1", 1e-3, 1e-3);
-                                     "Error: Incorrect results d1",
-                                     1e-3,
-                                     1e-3);
    }
    {

--- a/example/21_gemm_layernorm/gemm_xdl_layernorm_single_kernel_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_xdl_layernorm_single_kernel_fp16.cpp
@@ -10,6 +10,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
@@ -132,15 +133,15 @@ int main(int argc, char* argv[])
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            using namespace ck::literals;
+            if constexpr(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-                                            std::vector<std::size_t>({stride, 1}));
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
-                                            std::vector<std::size_t>({1, stride}));
            }
        };
@@ -149,10 +150,10 @@ int main(int argc, char* argv[])
    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    Tensor<AccDataType> acc_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<C0DataType> c0_n_bias(HostTensorDescriptor(std::vector<size_t>({size_t(N)})));
+    Tensor<C0DataType> c0_n_bias({N});
    Tensor<C0DataType> c0_m_n_add(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<C0DataType> c0_n_gamma(HostTensorDescriptor(std::vector<size_t>({size_t(N)})));
+    Tensor<C0DataType> c0_n_gamma({N});
-    Tensor<C0DataType> c0_n_beta(HostTensorDescriptor(std::vector<size_t>({size_t(N)})));
+    Tensor<C0DataType> c0_n_beta({N});
    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
@@ -274,15 +275,12 @@ int main(int argc, char* argv[])
        if constexpr(std::is_same<CShuffleDataType, F32>::value)
        {
            pass &= ck::utils::check_err(
-                c_m_n_device_result.mData, c_m_n_host_result.mData, "Error: Incorrect results c");
+                c_m_n_device_result, c_m_n_host_result, "Error: Incorrect results c");
        }
        else if constexpr(std::is_same<CShuffleDataType, F16>::value)
        {
-            pass &= ck::utils::check_err(c_m_n_device_result.mData,
+            pass &= ck::utils::check_err(
-                                         c_m_n_host_result.mData,
+                c_m_n_device_result, c_m_n_host_result, "Error: Incorrect results c", 1e-2, 1e-2);
-                                         "Error: Incorrect results c",
-                                         1e-2,
-                                         1e-2);
        }
    }
    return pass ? 0 : 1;

--- a/example/22_cgemm/cgemm_xdl_common.hpp
+++ b/example/22_cgemm/cgemm_xdl_common.hpp
@@ -11,6 +11,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 template <ck::index_t... Is>
@@ -62,15 +63,15 @@ bool run_cgemm_xdl(ck::index_t M,
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-                                            std::vector<std::size_t>({stride, 1}));
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
-                                            std::vector<std::size_t>({1, stride}));
            }
        };
@@ -219,14 +220,14 @@ bool run_cgemm_xdl(ck::index_t M,
            const Tensor<CDataType> c_m_n_real_device_result_converted(c_m_n_real_device_result);
            const Tensor<CDataType> c_m_n_imag_device_result_converted(c_m_n_imag_device_result);
-            result = ck::utils::check_err(c_m_n_real_device_result_converted.mData,
+            result = ck::utils::check_err(c_m_n_real_device_result_converted,
-                                          c_m_n_real_host_result.mData,
+                                          c_m_n_real_host_result,
                                          "Verification error: incorrect results in real part!",
                                          1e-2f,
                                          1e-1f);
            result = result && ck::utils::check_err(
-                                   c_m_n_imag_device_result_converted.mData,
+                                   c_m_n_imag_device_result_converted,
-                                   c_m_n_imag_host_result.mData,
+                                   c_m_n_imag_host_result,
                                   "Verification error: incorrect results in imaginary part!",
                                   1e-2f,
                                   1e-1f);
@@ -234,14 +235,14 @@ bool run_cgemm_xdl(ck::index_t M,
        else
 #endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
        {
-            result = ck::utils::check_err(c_m_n_real_device_result.mData,
+            result = ck::utils::check_err(c_m_n_real_device_result,
-                                          c_m_n_real_host_result.mData,
+                                          c_m_n_real_host_result,
                                          "Verification error: incorrect results in real part!",
                                          1e-2f,
                                          1e-1f);
            result = result && ck::utils::check_err(
-                                   c_m_n_imag_device_result.mData,
+                                   c_m_n_imag_device_result,
-                                   c_m_n_imag_host_result.mData,
+                                   c_m_n_imag_host_result,
                                   "Verification error: incorrect results in imaginary part!",
                                   1e-2f,
                                   1e-1f);

--- a/example/23_softmax/softmax_blockwise.cpp
+++ b/example/23_softmax/softmax_blockwise.cpp
@@ -56,8 +56,8 @@ class SimpleAppArgs
    int option_index = 0;
    public:
-    std::vector<size_t> inLengths   = {8, 128, 2048};
+    std::vector<size_t> inLengths = {8, 128, 2048};
-    std::vector<AccDataType> scales = {2.0f, 2.0f};
+    std::vector<double> scales    = {2.0, 2.0};
    bool do_verification = true;
    int init_method      = 2;
@@ -151,8 +151,8 @@ int main(int argc, char* argv[])
    auto inStrides  = in.mDesc.GetStrides();
    auto outStrides = out.mDesc.GetStrides();
-    AccDataType alpha = args.scales[0];
+    double alpha = args.scales[0];
-    AccDataType beta  = args.scales[1];
+    double beta  = args.scales[1];
    std::cout << "in: " << in.mDesc << std::endl;
    std::cout << "out: " << out.mDesc << std::endl;
@@ -221,8 +221,8 @@ int main(int argc, char* argv[])
    auto argument_ptr = device_instance.MakeArgumentPointer(i_inLengths,
                                                            i_inStrides,
                                                            reduceDims,
-                                                            &alpha,
+                                                            alpha,
-                                                            &beta,
+                                                            beta,
                                                            in_dev.GetDeviceBuffer(),
                                                            out_dev.GetDeviceBuffer(),
                                                            PassThrough{},
@@ -246,7 +246,7 @@ int main(int argc, char* argv[])
        invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
        out_dev.FromDevice(out.mData.data());
        // LogRangeAsType<float>(std::cout << "tensor out: " , out.mData, ",") << std::endl;
-        pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
+        pass = pass && ck::utils::check_err(out, out_ref);
    };
    float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, args.time_kernel});

--- a/example/24_batched_gemm/run_batched_gemm_example.inc
+++ b/example/24_batched_gemm/run_batched_gemm_example.inc
@@ -55,15 +55,15 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
                                       std::size_t stride,
                                       std::size_t batch_stride,
                                       auto layout) {
+        using namespace ck::literals;
        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
        {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count_, row, col}),
+            return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, stride, 1_uz});
-                                        std::vector<std::size_t>({batch_stride, stride, 1}));
        }
        else
        {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count_, row, col}),
+            return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, 1_uz, stride});
-                                        std::vector<std::size_t>({batch_stride, 1, stride}));
        }
    };
@@ -174,11 +174,11 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
 #ifdef BUILD_INT4_EXAMPLE
        const Tensor<EDataType> e_device_result_converted(e_g_m_n_device_result);
-        pass &= ck::utils::check_err(e_device_result_converted.mData, e_g_m_n_host_result.mData);
+        pass &= ck::utils::check_err(e_device_result_converted, e_g_m_n_host_result);
 #else
        pass = ck::utils::check_err(
-            e_g_m_n_device_result.mData, e_g_m_n_host_result.mData, "Error: Incorrect results c");
+            e_g_m_n_device_result, e_g_m_n_host_result, "Error: Incorrect results c");
 #endif
    }

--- a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp
+++ b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp
@@ -15,6 +15,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/numeric.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
@@ -246,21 +247,11 @@ int main(int argc, char* argv[])
        exit(0);
    }
-    Tensor<ADataType> a_gs_ms_ks(
+    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
-        std::vector<std::size_t>(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.end()),
+    Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides);
-        std::vector<std::size_t>(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end()));
+    Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides);
-    Tensor<BDataType> b_gs_ns_ks(
+    Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
-        std::vector<std::size_t>(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.end()),
+    Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
-        std::vector<std::size_t>(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end()));
-    Tensor<DDataType> d_gs_ms_ns(
-        std::vector<std::size_t>(d_gs_ms_ns_lengths.begin(), d_gs_ms_ns_lengths.end()),
-        std::vector<std::size_t>(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end()));
-    Tensor<EDataType> e_gs_ms_ns_host_result(
-        std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
-    Tensor<EDataType> e_gs_ms_ns_device_result(
-        std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
    std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
    std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
@@ -327,20 +318,14 @@ int main(int argc, char* argv[])
    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
-    std::size_t M = std::accumulate(e_gs_ms_ns_lengths.begin() + NumDimG,
+    std::size_t M = ck::accumulate_n<ck::index_t>(
-                                    e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM,
+        e_gs_ms_ns_lengths.begin() + NumDimG, NumDimM, 1, std::multiplies<>{});
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
-    std::size_t N = std::accumulate(e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM,
+    std::size_t N = ck::accumulate_n<ck::index_t>(
-                                    e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM + NumDimN,
+        e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM, NumDimN, 1, std::multiplies<>{});
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
-    std::size_t K = std::accumulate(a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM,
+    std::size_t K = ck::accumulate_n<ck::index_t>(
-                                    a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM + NumDimK,
+        a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM, NumDimK, 1, std::multiplies<>{});
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
    std::size_t flop      = std::size_t(2) * M * N * K;
    std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
@@ -357,9 +342,7 @@ int main(int argc, char* argv[])
    if(do_verification)
    {
-        Tensor<CShuffleDataType> c_gs_ms_ns_host_result(
+        Tensor<CShuffleDataType> c_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
-            std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
-            std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
        using ReferenceOpInstance = ReferenceContraction_G1_M2_N3_K1<NumDimM,
                                                                     NumDimN,
@@ -407,9 +390,7 @@ int main(int argc, char* argv[])
            }
        }
-        return ck::utils::check_err(e_gs_ms_ns_device_result.mData, e_gs_ms_ns_host_result.mData)
+        return ck::utils::check_err(e_gs_ms_ns_device_result, e_gs_ms_ns_host_result) ? 0 : 1;
-                   ? 0
-                   : 1;
    }
    return 0;

--- a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp
+++ b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp
@@ -15,6 +15,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/numeric.hpp"
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -246,21 +247,11 @@ int main(int argc, char* argv[])
        exit(0);
    }
-    Tensor<ADataType> a_gs_ms_ks(
+    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
-        std::vector<std::size_t>(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.end()),
+    Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides);
-        std::vector<std::size_t>(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end()));
+    Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides);
-    Tensor<BDataType> b_gs_ns_ks(
+    Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
-        std::vector<std::size_t>(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.end()),
+    Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
-        std::vector<std::size_t>(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end()));
-    Tensor<DDataType> d_gs_ms_ns(
-        std::vector<std::size_t>(d_gs_ms_ns_lengths.begin(), d_gs_ms_ns_lengths.end()),
-        std::vector<std::size_t>(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end()));
-    Tensor<EDataType> e_gs_ms_ns_host_result(
-        std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
-    Tensor<EDataType> e_gs_ms_ns_device_result(
-        std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
    std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
    std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
@@ -327,20 +318,14 @@ int main(int argc, char* argv[])
    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
-    ck::index_t M = std::accumulate(e_gs_ms_ns_lengths.begin(),
+    ck::index_t M =
-                                    e_gs_ms_ns_lengths.begin() + NumDimM,
+        ck::accumulate_n<ck::index_t>(e_gs_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
-    ck::index_t N = std::accumulate(e_gs_ms_ns_lengths.begin() + NumDimM,
+    ck::index_t N = ck::accumulate_n<ck::index_t>(
-                                    e_gs_ms_ns_lengths.begin() + NumDimM + NumDimN,
+        e_gs_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
-    ck::index_t K = std::accumulate(a_gs_ms_ks_lengths.begin() + NumDimM,
+    ck::index_t K = ck::accumulate_n<ck::index_t>(
-                                    a_gs_ms_ks_lengths.begin() + NumDimM + NumDimK,
+        a_gs_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
    std::size_t flop      = std::size_t(2) * M * N * K;
    std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
@@ -357,9 +342,7 @@ int main(int argc, char* argv[])
    if(do_verification)
    {
-        Tensor<CShuffleDataType> c_gs_ms_ns_host_result(
+        Tensor<CShuffleDataType> c_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
-            std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
-            std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
        using ReferenceOpInstance = ReferenceContraction_G1_M3_N2_K1<NumDimG,
                                                                     NumDimM,
@@ -408,9 +391,7 @@ int main(int argc, char* argv[])
            }
        }
-        return ck::utils::check_err(e_gs_ms_ns_device_result.mData, e_gs_ms_ns_host_result.mData)
+        return ck::utils::check_err(e_gs_ms_ns_device_result, e_gs_ms_ns_host_result) ? 0 : 1;
-                   ? 0
-                   : 1;
    }
    return 0;

--- a/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
@@ -15,6 +15,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/numeric.hpp"
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -288,21 +289,11 @@ int main(int argc, char* argv[])
        exit(0);
    }
-    Tensor<ADataType> a_ms_ks(
+    Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides);
-        std::vector<std::size_t>(a_ms_ks_lengths.begin(), a_ms_ks_lengths.end()),
+    Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides);
-        std::vector<std::size_t>(a_ms_ks_strides.begin(), a_ms_ks_strides.end()));
+    Tensor<EDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides);
-    Tensor<BDataType> b_ns_ks(
+    Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
-        std::vector<std::size_t>(b_ns_ks_lengths.begin(), b_ns_ks_lengths.end()),
+    Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
-        std::vector<std::size_t>(b_ns_ks_strides.begin(), b_ns_ks_strides.end()));
-    Tensor<EDataType> d_ms_ns(
-        std::vector<std::size_t>(d_ms_ns_lengths.begin(), d_ms_ns_lengths.end()),
-        std::vector<std::size_t>(d_ms_ns_strides.begin(), d_ms_ns_strides.end()));
-    Tensor<EDataType> e_ms_ns_host_result(
-        std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
-    Tensor<EDataType> e_ms_ns_device_result(
-        std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
    std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl;
    std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl;
@@ -368,20 +359,14 @@ int main(int argc, char* argv[])
    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
-    ck::index_t M = std::accumulate(e_ms_ns_lengths.begin(),
+    ck::index_t M =
-                                    e_ms_ns_lengths.begin() + NumDimM,
+        ck::accumulate_n<ck::index_t>(e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
-    ck::index_t N = std::accumulate(e_ms_ns_lengths.begin() + NumDimM,
+    ck::index_t N = ck::accumulate_n<ck::index_t>(
-                                    e_ms_ns_lengths.begin() + NumDimM + NumDimN,
+        e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
-    ck::index_t K = std::accumulate(a_ms_ks_lengths.begin() + NumDimM,
+    ck::index_t K = ck::accumulate_n<ck::index_t>(
-                                    a_ms_ks_lengths.begin() + NumDimM + NumDimK,
+        a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
    std::size_t flop      = std::size_t(2) * M * N * K;
    std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
@@ -398,9 +383,7 @@ int main(int argc, char* argv[])
    if(do_verification)
    {
-        Tensor<CShuffleDataType> c_ms_ns_host_result(
+        Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
-            std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-            std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
        using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM,
                                                                  NumDimN,
@@ -437,7 +420,7 @@ int main(int argc, char* argv[])
            }
        }
-        return ck::utils::check_err(e_ms_ns_device_result.mData, e_ms_ns_host_result.mData) ? 0 : 1;
+        return ck::utils::check_err(e_ms_ns_device_result, e_ms_ns_host_result) ? 0 : 1;
    }
    return 0;

--- a/example/26_contraction/contraction_scale_xdl_fp32.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp32.cpp
@@ -15,6 +15,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/numeric.hpp"
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -277,18 +278,10 @@ int main(int argc, char* argv[])
        exit(0);
    }
-    Tensor<ADataType> a_ms_ks(
+    Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides);
-        std::vector<std::size_t>(a_ms_ks_lengths.begin(), a_ms_ks_lengths.end()),
+    Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides);
-        std::vector<std::size_t>(a_ms_ks_strides.begin(), a_ms_ks_strides.end()));
+    Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
-    Tensor<BDataType> b_ns_ks(
+    Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
-        std::vector<std::size_t>(b_ns_ks_lengths.begin(), b_ns_ks_lengths.end()),
-        std::vector<std::size_t>(b_ns_ks_strides.begin(), b_ns_ks_strides.end()));
-    Tensor<EDataType> e_ms_ns_host_result(
-        std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
-    Tensor<EDataType> e_ms_ns_device_result(
-        std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
    std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl;
    std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl;
@@ -349,20 +342,14 @@ int main(int argc, char* argv[])
    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
-    ck::index_t M = std::accumulate(e_ms_ns_lengths.begin(),
+    ck::index_t M =
-                                    e_ms_ns_lengths.begin() + NumDimM,
+        ck::accumulate_n<ck::index_t>(e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
-    ck::index_t N = std::accumulate(e_ms_ns_lengths.begin() + NumDimM,
+    ck::index_t N = ck::accumulate_n<ck::index_t>(
-                                    e_ms_ns_lengths.begin() + NumDimM + NumDimN,
+        e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
-    ck::index_t K = std::accumulate(a_ms_ks_lengths.begin() + NumDimM,
+    ck::index_t K = ck::accumulate_n<ck::index_t>(
-                                    a_ms_ks_lengths.begin() + NumDimM + NumDimK,
+        a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
    std::size_t flop = std::size_t(2) * M * N * K;
    std::size_t num_btype =
@@ -379,9 +366,7 @@ int main(int argc, char* argv[])
    if(do_verification)
    {
-        Tensor<CShuffleDataType> c_ms_ns_host_result(
+        Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
-            std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-            std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
        using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM,
                                                                  NumDimN,
@@ -417,7 +402,7 @@ int main(int argc, char* argv[])
            }
        }
-        return ck::utils::check_err(e_ms_ns_device_result.mData, e_ms_ns_host_result.mData) ? 0 : 1;
+        return ck::utils::check_err(e_ms_ns_device_result, e_ms_ns_host_result) ? 0 : 1;
    }
    return 0;

--- a/example/27_layernorm/CMakeLists.txt
+++ b/example/27_layernorm/CMakeLists.txt
 add_example_executable(example_layernorm_blockwise layernorm_blockwise.cpp)
\ No newline at end of file
--- a/example/27_layernorm/layernorm_blockwise.cpp
+++ b/example/27_layernorm/layernorm_blockwise.cpp
@@ -17,6 +17,7 @@
 #include "ck/library/utility/host_common_util.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
 using XDataType     = ck::half_t;
@@ -60,13 +61,13 @@ int main()
    ck::index_t Stride = N;
    auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
-        return HostTensorDescriptor(std::vector<std::size_t>({len}),
+        return HostTensorDescriptor({len}, {stride});
-                                    std::vector<std::size_t>({stride}));
    };
    auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) {
-        return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+        using namespace ck::literals;
-                                    std::vector<std::size_t>({stride, 1}));
+        return HostTensorDescriptor({row, col}, {stride, 1_uz});
    };
    Tensor<XDataType> x(f_host_tensor_descriptor2d(M, N, Stride));
@@ -100,6 +101,8 @@ int main()
        gamma_dev.GetDeviceBuffer(),
        beta_dev.GetDeviceBuffer(),
        y_dev.GetDeviceBuffer(),
+        nullptr,
+        nullptr,
        PassThrough{});
    if(!device_instance.IsSupportedArgument(argument_ptr.get()))
@@ -130,8 +133,7 @@ int main()
        ref_invoker.Run(ref_argument);
        y_dev.FromDevice(y.mData.data());
-        pass &=
+        pass &= ck::utils::check_err(y, host_y, "Error: Incorrect results d1", 1e-3, 1e-3);
-            ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results d1", 1e-3, 1e-3);
    }
    return (pass ? 0 : 1);
 }
--- a/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp
+++ b/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp
@@ -16,6 +16,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/numeric.hpp"
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -297,33 +298,19 @@ int main(int argc, char* argv[])
        const auto e_ms_ns_lengths = contraction_descs[i].e_ms_ns_lengths;
        const auto e_ms_ns_strides = contraction_descs[i].e_ms_ns_strides;
-        Tensor<ADataType> a_ms_ks(
+        Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides);
-            std::vector<std::size_t>(a_ms_ks_lengths.begin(), a_ms_ks_lengths.end()),
+        Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides);
-            std::vector<std::size_t>(a_ms_ks_strides.begin(), a_ms_ks_strides.end()));
+        Tensor<DDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides);
-        Tensor<BDataType> b_ns_ks(
+        Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
-            std::vector<std::size_t>(b_ns_ks_lengths.begin(), b_ns_ks_lengths.end()),
-            std::vector<std::size_t>(b_ns_ks_strides.begin(), b_ns_ks_strides.end()));
+        ck::index_t M_ =
-        Tensor<DDataType> d_ms_ns(
+            ck::accumulate_n<ck::index_t>(e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
-            std::vector<std::size_t>(d_ms_ns_lengths.begin(), d_ms_ns_lengths.end()),
-            std::vector<std::size_t>(d_ms_ns_strides.begin(), d_ms_ns_strides.end()));
+        ck::index_t N_ = ck::accumulate_n<ck::index_t>(
-        Tensor<EDataType> e_ms_ns_device_result(
+            e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
-            std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-            std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
+        ck::index_t K_ = ck::accumulate_n<ck::index_t>(
+            a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
-        ck::index_t M_ = std::accumulate(e_ms_ns_lengths.begin(),
-                                         e_ms_ns_lengths.begin() + NumDimM,
-                                         ck::index_t{1},
-                                         std::multiplies<ck::index_t>{});
-        ck::index_t N_ = std::accumulate(e_ms_ns_lengths.begin() + NumDimM,
-                                         e_ms_ns_lengths.begin() + NumDimM + NumDimN,
-                                         ck::index_t{1},
-                                         std::multiplies<ck::index_t>{});
-        ck::index_t K_ = std::accumulate(a_ms_ks_lengths.begin() + NumDimM,
-                                         a_ms_ks_lengths.begin() + NumDimM + NumDimK,
-                                         ck::index_t{1},
-                                         std::multiplies<ck::index_t>{});
        a_tensors.push_back(a_ms_ks);
        b_tensors.push_back(b_ns_ks);
@@ -423,13 +410,9 @@ int main(int argc, char* argv[])
            const auto e_ms_ns_lengths = contraction_descs[i].e_ms_ns_lengths;
            const auto e_ms_ns_strides = contraction_descs[i].e_ms_ns_strides;
-            Tensor<EDataType> c_ms_ns_host_result(
+            Tensor<EDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
-                std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-                std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
-            Tensor<EDataType> e_ms_ns_host_result(
+            Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
-                std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-                std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
            e_tensors_device[i]->FromDevice(e_device_tensors[i].mData.data());
@@ -475,7 +458,7 @@ int main(int argc, char* argv[])
                }
            }
-            pass &= ck::utils::check_err(e_device_tensors[i].mData, e_ms_ns_host_result.mData);
+            pass &= ck::utils::check_err(e_device_tensors[i], e_ms_ns_host_result);
        }
    }

--- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp
+++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp
@@ -15,6 +15,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/numeric.hpp"
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -246,21 +247,11 @@ int main(int argc, char* argv[])
        exit(0);
    }
-    Tensor<ADataType> a_gs_ms_ks(
+    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
-        std::vector<std::size_t>(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.end()),
+    Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides);
-        std::vector<std::size_t>(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end()));
+    Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides);
-    Tensor<BDataType> b_gs_ns_ks(
+    Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
-        std::vector<std::size_t>(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.end()),
+    Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
-        std::vector<std::size_t>(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end()));
-    Tensor<DDataType> d_gs_ms_ns(
-        std::vector<std::size_t>(d_gs_ms_ns_lengths.begin(), d_gs_ms_ns_lengths.end()),
-        std::vector<std::size_t>(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end()));
-    Tensor<EDataType> e_gs_ms_ns_host_result(
-        std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
-    Tensor<EDataType> e_gs_ms_ns_device_result(
-        std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
    std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
    std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
@@ -327,25 +318,17 @@ int main(int argc, char* argv[])
    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
-    ck::index_t G = std::accumulate(e_gs_ms_ns_lengths.begin(),
+    ck::index_t G =
-                                    e_gs_ms_ns_lengths.begin() + NumDimG,
+        ck::accumulate_n<ck::index_t>(e_gs_ms_ns_lengths.begin(), NumDimG, 1, std::multiplies<>{});
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
-    ck::index_t M = std::accumulate(e_gs_ms_ns_lengths.begin() + NumDimG,
+    ck::index_t M = ck::accumulate_n<ck::index_t>(
-                                    e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM,
+        e_gs_ms_ns_lengths.begin() + NumDimG, NumDimM, 1, std::multiplies<>{});
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
-    ck::index_t N = std::accumulate(e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM,
+    ck::index_t N = ck::accumulate_n<ck::index_t>(
-                                    e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM + NumDimN,
+        e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM, NumDimN, 1, std::multiplies<>{});
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
-    ck::index_t K = std::accumulate(a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM,
+    ck::index_t K = ck::accumulate_n<ck::index_t>(
-                                    a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM + NumDimK,
+        a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM, NumDimK, 1, std::multiplies<>{});
-                                    ck::index_t{1},
-                                    std::multiplies<ck::index_t>{});
    std::size_t flop      = std::size_t(2) * G * M * N * K;
    std::size_t num_btype = sizeof(ADataType) * G * M * K + sizeof(BDataType) * G * K * N +
@@ -362,9 +345,7 @@ int main(int argc, char* argv[])
    if(do_verification)
    {
-        Tensor<CShuffleDataType> c_ms_ns_host_result(
+        Tensor<CShuffleDataType> c_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
-            std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
-            std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
        using ReferenceOpInstance = ReferenceContraction_G2_M2_N2_K1<NumDimG,
                                                                     NumDimM,
@@ -409,9 +390,7 @@ int main(int argc, char* argv[])
            }
        }
-        return ck::utils::check_err(e_gs_ms_ns_device_result.mData, e_gs_ms_ns_host_result.mData)
+        return ck::utils::check_err(e_gs_ms_ns_device_result, e_gs_ms_ns_host_result) ? 0 : 1;
-                   ? 0
-                   : 1;
    }
    return 0;

--- a/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
+++ b/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
+add_custom_target(example_grouped_conv_fwd_multiple_d)
+add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp16 grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp)
+add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp32 grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp)
+add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_bf16 grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp)
+add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int8 grouped_conv_fwd_bias_relu_add_xdl_int8.cpp)
+add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp16)
+add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp32)
+add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_bf16)
+add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int8)
+if(USE_BITINT_EXTENSION_INT4)
+  add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int4 grouped_conv_fwd_bias_relu_add_xdl_int4.cpp)
+  add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4)
+endif() # USE_BITINT_EXTENSION_INT4
+add_example_executable(example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp)
+add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_xdl_fp16)
--- a/example/30_grouped_conv_fwd_multiple_d/README.md
+++ b/example/30_grouped_conv_fwd_multiple_d/README.md
+Command
+```bash
+arg1: verification (0=no, 1=yes)
+arg2: initialization (0=no init, 1=integer value, 2=decimal value)
+arg3: time kernel (0=no, 1=yes)
+Following arguments (depending on number of spatial dims):
+ Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)
+ G, N, K, C,
+ <filter spatial dimensions>, (i.e. Y, X for 2D)
+ <input image spatial dimensions>, (i.e. Hi, Wi for 2D)
+ <strides>, (i.e. Sy, Sx for 2D)
+ <dilations>, (i.e. Dy, Dx for 2D)
+ <left padding>, (i.e. LeftPy, LeftPx for 2D)
+ <right padding>, (i.e. RightPy, RightPx for 2D)
+./bin/example_grouped_conv_fwd_bias_relu_add_xdl_fp16 1 1 1
+```
+Result (MI100)
+```
+in: dim 5, lengths {1, 128, 192, 71, 71}, strides {192, 967872, 1, 13632, 192}
+wei: dim 5, lengths {1, 256, 192, 3, 3}, strides {442368, 1728, 1, 576, 192}
+bias: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
+residual: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
+out: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 331776, 1, 9216, 256}
+launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
+Warm up 1 time
+Start running 10 times...
+Perf: 1.55981 ms, 94.0927 TFlops, 213.868 GB/s, DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<256, 128, 256, 16, Default>
+```
--- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "grouped_convnd_fwd_bias_relu_add_common.hpp"
+#pragma once
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include <algorithm>
+#include <array>
-#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include <iostream>
+#include <string>
-// kernel data types
+#include <type_traits>
-using InKernelDataType       = int8_t;
-using WeiKernelDataType      = int8_t;
-using AccDataType            = int32_t;
-using CShuffleDataType       = int8_t;
-using BiasKernelDataType     = int8_t;
-using ResidualKernelDataType = int8_t;
-using OutKernelDataType      = int8_t;
-// tensor data types
+#include "ck/ck.hpp"
-using InUserDataType  = ck::int4_t;
+#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
-using WeiUserDataType = ck::int4_t;
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-using OutUserDataType = ck::int4_t;
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+using BF16 = ck::bhalf_t;
+using FP16 = ck::half_t;
+using FP32 = float;
+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+using I4 = ck::int4_t;
+#endif
+using I8  = std::int8_t;
+using I32 = std::int32_t;
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
-using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
 static constexpr auto ConvSpec =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
 static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-template <ck::index_t NDimSpatial,
+template <typename InputLay, typename WeightLay, typename OutputLay>
-          typename InLayout,
+struct CommonLayoutSetting
-          typename WeiLayout,
+{
-          typename BiasLayout,
+    using InputLayout  = InputLay;
-          typename ResidualLayout,
+    using WeightLayout = WeightLay;
-          typename OutLayout>
+    using OutputLayout = OutputLay;
-using DeviceGroupedConvNDFwdInstance =
+};
-    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
-        NDimSpatial,
+template <ck::index_t NDimSpatial>
-        InLayout,
+struct CommonLayoutSettingSelector;
-        WeiLayout,
-        ck::Tuple<BiasLayout, ResidualLayout>,
+namespace ctl = ck::tensor_layout::convolution;
-        OutLayout,
-        InKernelDataType,
+template <>
-        WeiKernelDataType,
+struct CommonLayoutSettingSelector<1> final
-        AccDataType,
+    : CommonLayoutSetting<ctl::G_NW_C, ctl::G_K_X_C, ctl::G_NW_K>
-        CShuffleDataType,
-        ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
-        OutKernelDataType,
-        InElementOp,
-        WeiElementOp,
-        OutElementOp,
-        ConvSpec,    // ConvForwardSpecialization
-        GemmSpec,    // GemmSpecialization
-        1,           //
-        256,         // BlockSize
-        128,         // MPerBlock
-        256,         // NPerBlock
-        64,          // KPerBlock
-        16,          // AK1
-        16,          // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
-        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
-        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
-        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
-        2,           // ABlockTransferSrcVectorDim
-        16,          // ABlockTransferSrcScalarPerVector
-        16,          // ABlockTransferDstScalarPerVector_AK1
-        1,           // ABlockLdsExtraM
-        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
-        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
-        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
-        2,           // BBlockTransferSrcVectorDim
-        16,          // BBlockTransferSrcScalarPerVector
-        16,          // BBlockTransferDstScalarPerVector_BK1
-        1,           // BBlockLdsExtraN
-        1,
-        1,
-        S<1, 64, 1, 4>,
-        16>;
-int main(int argc, char* argv[])
 {
-    namespace ctc = ck::tensor_layout::convolution;
+};
-    print_helper_msg();
+template <>
+struct CommonLayoutSettingSelector<2> final
+    : CommonLayoutSetting<ctl::G_NHW_C, ctl::G_K_YX_C, ctl::G_NHW_K>
+{
+};
+template <>
+struct CommonLayoutSettingSelector<3> final
+    : CommonLayoutSetting<ctl::G_NDHW_C, ctl::G_K_ZYX_C, ctl::G_NDHW_K>
+{
+};
+template <ck::index_t NDimSpatial>
+using InputLayout = typename CommonLayoutSettingSelector<NDimSpatial>::InputLayout;
+template <ck::index_t NDimSpatial>
+using WeightLayout = typename CommonLayoutSettingSelector<NDimSpatial>::WeightLayout;
+template <ck::index_t NDimSpatial>
+using OutputLayout = typename CommonLayoutSettingSelector<NDimSpatial>::OutputLayout;
+struct ExecutionConfig final
+{
    bool do_verification = true;
    int init_method      = 1;
-    bool time_kernel     = false;
+    bool time_kernel     = true;
+};
-    // conventional group conv definition
-    // G = 2
+#define DefaultConvParam                                                       \
-    // [N, C, Hi, Wi] =  [128, 384, 71, 71]
+    ck::utils::conv::ConvParam                                                 \
-    // [K, C,  Y,  X] =  [512, 192,  3,  3]
+    {                                                                          \
-    // [N, K, Ho, Wo] =  [128, 512, 36, 36]
+        2, 32, 2, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, { 1, 1 } \
-    // CK group conv definition
+    }
-    // [G, N, C, Hi, Wi] =  [2, 128, 192, 71, 71]
-    // [G, K, C,  Y,  X] =  [2, 256, 192,  3,  3]
+inline void print_help_msg()
-    // [G, N, K, Ho, Wo] =  [2, 128, 256, 36, 36]
+{
-    ck::utils::conv::ConvParam conv_param{
+    std::cerr << "arg1: verification (0=no, 1=yes)\n"
-        2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
+              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
+              << "arg3: time kernel (0=no, 1=yes)\n"
+              << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
+}
+inline bool parse_cmd_args(int argc,
+                           char* argv[],
+                           ExecutionConfig& config,
+                           ck::utils::conv::ConvParam& conv_param)
+{
+    constexpr int num_execution_config_args =
+        3; // arguments for do_verification, init_method, time_kernel
+    constexpr int num_conv_param_leading_args = 5; // arguments for num_dim_spatial_, G_, N_, K_, C_
+    constexpr int threshold_to_catch_partial_args = 1 + num_execution_config_args;
+    constexpr int threshold_to_catch_all_args =
+        threshold_to_catch_partial_args + num_conv_param_leading_args;
    if(argc == 1)
    {
        // use default
    }
-    else if(argc == 4)
+    // catch only ExecutionConfig arguments
+    else if(argc == threshold_to_catch_partial_args)
    {
-        do_verification = std::stoi(argv[1]);
+        config.do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
+        config.init_method     = std::stoi(argv[2]);
-        time_kernel     = std::stoi(argv[3]);
+        config.time_kernel     = std::stoi(argv[3]);
    }
-    else
+    // catch both ExecutionConfig & ConvParam arguments
+    else if(threshold_to_catch_all_args < argc && ((argc - threshold_to_catch_all_args) % 3 == 0))
    {
-        do_verification                   = std::stoi(argv[1]);
+        config.do_verification = std::stoi(argv[1]);
-        init_method                       = std::stoi(argv[2]);
+        config.init_method     = std::stoi(argv[2]);
-        time_kernel                       = std::stoi(argv[3]);
+        config.time_kernel     = std::stoi(argv[3]);
-        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
-        conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
+        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
+        conv_param                        = ck::utils::conv::parse_conv_param(
+            num_dim_spatial, threshold_to_catch_partial_args, argv);
+    }
+    else
+    {
+        print_help_msg();
+        return false;
    }
-    const auto in_element_op  = InElementOp{};
+    return true;
-    const auto wei_element_op = WeiElementOp{};
+}
-    const auto out_element_op = OutElementOp{};
-    if(conv_param.num_dim_spatial_ == 1)
+inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvParam& conv_param)
+{
+    switch(conv_param.num_dim_spatial_)
    {
-        using InLayout       = ctc::G_NW_C;
+    case 1:
-        using WeiLayout      = ctc::G_K_X_C;
+        return HostTensorDescriptor(
-        using BiasLayout     = ctc::G_K;
-        using ResidualLayout = ctc::G_NW_K;
-        using OutLayout      = ctc::G_NW_K;
-        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
            {
                conv_param.C_,                                                        // g
@@ -150,81 +162,8 @@ int main(int argc, char* argv[])
                conv_param.G_ * conv_param.C_                                         // wi
            });
-        const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
+    case 2:
-            {conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
+        return HostTensorDescriptor(
-            {
-                conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
-                conv_param.filter_spatial_lengths_[0] * conv_param.C_,                 // k
-                1,                                                                     // c
-                conv_param.C_                                                          // x
-            });
-        const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
-            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
-            {
-                conv_param.K_, // g
-                0,             // k
-                1,             // c
-                0              // x
-            });
-        const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
-            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
-            {
-                conv_param.K_, // g
-                0,             // k
-                1,             // c
-                0              // x
-            });
-        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
-            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
-            {
-                conv_param.K_,                                                         // g
-                conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
-                1,                                                                     // k
-                conv_param.G_ * conv_param.K_                                          // wo
-            });
-        return run_grouped_conv_fwd_bias_relu_add<1,
-                                                  InKernelDataType,
-                                                  WeiKernelDataType,
-                                                  CShuffleDataType,
-                                                  OutKernelDataType,
-                                                  InElementOp,
-                                                  WeiElementOp,
-                                                  OutElementOp,
-                                                  InUserDataType,
-                                                  WeiUserDataType,
-                                                  OutUserDataType,
-                                                  DeviceGroupedConvNDFwdInstance<1,
-                                                                                 InLayout,
-                                                                                 WeiLayout,
-                                                                                 BiasLayout,
-                                                                                 ResidualLayout,
-                                                                                 OutLayout>>(
-            do_verification,
-            init_method,
-            time_kernel,
-            conv_param,
-            in_g_n_c_wis_desc,
-            wei_g_k_c_xs_desc,
-            bias_g_n_k_wos_desc,
-            residual_g_n_k_wos_desc,
-            out_g_n_k_wos_desc,
-            in_element_op,
-            wei_element_op,
-            out_element_op);
-    }
-    else if(conv_param.num_dim_spatial_ == 2)
-    {
-        using InLayout       = ctc::G_NHW_C;
-        using WeiLayout      = ctc::G_K_YX_C;
-        using BiasLayout     = ctc::G_K;
-        using ResidualLayout = ctc::G_NHW_K;
-        using OutLayout      = ctc::G_NHW_K;
-        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.C_,
@@ -239,104 +178,8 @@ int main(int argc, char* argv[])
                conv_param.G_ * conv_param.C_                                         // wi
            });
-        const auto wei_g_k_c_xs_desc =
+    case 3:
-            HostTensorDescriptor({conv_param.G_,
+        return HostTensorDescriptor(
-                                  conv_param.K_,
-                                  conv_param.C_,
-                                  conv_param.filter_spatial_lengths_[0],
-                                  conv_param.filter_spatial_lengths_[1]},
-                                 {
-                                     conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
-                                         conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
-                                     conv_param.filter_spatial_lengths_[0] *
-                                         conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
-                                     1,                                                         // c
-                                     conv_param.filter_spatial_lengths_[1] * conv_param.C_,     // y
-                                     conv_param.C_                                              // x
-                                 });
-        const auto bias_g_n_k_wos_desc =
-            HostTensorDescriptor({conv_param.G_,
-                                  conv_param.N_,
-                                  conv_param.K_,
-                                  conv_param.output_spatial_lengths_[0],
-                                  conv_param.output_spatial_lengths_[1]},
-                                 {
-                                     conv_param.K_, // g
-                                     0,             // n
-                                     1,             // k
-                                     0,             // ho
-                                     0              // wo
-                                 });
-        const auto residual_g_n_k_wos_desc =
-            HostTensorDescriptor({conv_param.G_,
-                                  conv_param.N_,
-                                  conv_param.K_,
-                                  conv_param.output_spatial_lengths_[0],
-                                  conv_param.output_spatial_lengths_[1]},
-                                 {
-                                     conv_param.K_, // g
-                                     0,             // n
-                                     1,             // k
-                                     0,             // ho
-                                     0              // wo
-                                 });
-        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
-            {conv_param.G_,
-             conv_param.N_,
-             conv_param.K_,
-             conv_param.output_spatial_lengths_[0],
-             conv_param.output_spatial_lengths_[1]},
-            {
-                conv_param.K_, // g
-                conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
-                    conv_param.G_ * conv_param.K_,                                     // n
-                1,                                                                     // k
-                conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
-                conv_param.G_ * conv_param.K_                                          // wo
-            });
-        return run_grouped_conv_fwd_bias_relu_add<2,
-                                                  InKernelDataType,
-                                                  WeiKernelDataType,
-                                                  CShuffleDataType,
-                                                  OutKernelDataType,
-                                                  InElementOp,
-                                                  WeiElementOp,
-                                                  OutElementOp,
-                                                  InUserDataType,
-                                                  WeiUserDataType,
-                                                  OutUserDataType,
-                                                  DeviceGroupedConvNDFwdInstance<2,
-                                                                                 InLayout,
-                                                                                 WeiLayout,
-                                                                                 BiasLayout,
-                                                                                 ResidualLayout,
-                                                                                 OutLayout>>(
-            do_verification,
-            init_method,
-            time_kernel,
-            conv_param,
-            in_g_n_c_wis_desc,
-            wei_g_k_c_xs_desc,
-            bias_g_n_k_wos_desc,
-            residual_g_n_k_wos_desc,
-            out_g_n_k_wos_desc,
-            in_element_op,
-            wei_element_op,
-            out_element_op);
-    }
-    else if(conv_param.num_dim_spatial_ == 3)
-    {
-        using InLayout       = ctc::G_NDHW_C;
-        using WeiLayout      = ctc::G_K_ZYX_C;
-        using BiasLayout     = ctc::G_K;
-        using ResidualLayout = ctc::G_NDHW_K;
-        using OutLayout      = ctc::G_NDHW_K;
-        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.C_,
@@ -353,8 +196,42 @@ int main(int argc, char* argv[])
                conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
                conv_param.G_ * conv_param.C_                                         // wi
            });
+    }
-        const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
+    throw std::runtime_error("unsuppored # dim spatial");
+}
+inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvParam& conv_param)
+{ // Builds the weight HostTensorDescriptor for 1, 2 or 3 spatial dims; throws otherwise.
+    switch(conv_param.num_dim_spatial_) // each case passes two brace lists, in the same dim order
+    { // first list reads as lengths, second as strides - TODO confirm vs HostTensorDescriptor
+    case 1: // dims ordered g, k, c, x
+        return HostTensorDescriptor(
+            {conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
+            {
+                conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
+                conv_param.filter_spatial_lengths_[0] * conv_param.C_,                 // k
+                1,                                                                     // c
+                conv_param.C_                                                          // x
+            });
+    case 2: // dims ordered g, k, c, y, x
+        return HostTensorDescriptor(
+            {conv_param.G_,
+             conv_param.K_,
+             conv_param.C_,
+             conv_param.filter_spatial_lengths_[0],
+             conv_param.filter_spatial_lengths_[1]},
+            {
+                conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
+                    conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
+                conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
+                    conv_param.C_,                                     // k
+                1,                                                     // c
+                conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
+                conv_param.C_                                          // x
+            });
+    case 3: // part of this call lies outside the diff hunks shown here
+        return HostTensorDescriptor(
             {conv_param.G_,
              conv_param.K_,
              conv_param.C_,
@@ -373,40 +250,89 @@ int main(int argc, char* argv[])
                 conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
                 conv_param.C_                                          // x
             });
+    }
-        const auto bias_g_n_k_wos_desc =
+    throw std::runtime_error("unsuppored # dim spatial"); // no case matched; "unsuppored" [sic]
-            HostTensorDescriptor({conv_param.G_,
+}
-                                  conv_param.N_,
-                                  conv_param.K_,
+inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvParam& conv_param)
-                                  conv_param.output_spatial_lengths_[0],
+{ // Builds the bias HostTensorDescriptor for 1, 2 or 3 spatial dims; throws otherwise.
-                                  conv_param.output_spatial_lengths_[1],
+    switch(conv_param.num_dim_spatial_) // 2nd list: K_ for g, 1 for k, 0 for n and spatial dims
-                                  conv_param.output_spatial_lengths_[2]},
+    { // NOTE(review): if the 2nd list is strides, 0 would broadcast over that dim - confirm
-                                 {
+    case 1: // dims ordered g, n, k, wo (same order as the first list)
-                                     conv_param.K_, // g
+        return HostTensorDescriptor(
-                                     0,             // n
+            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
-                                     1,             // k
+            {
-                                     0,             // z
+                conv_param.K_, // g
-                                     0,             // y
+                0,             // n
-                                     0              // x
+                1,             // k
-                                 });
+                0              // wo
+            });
-        const auto residual_g_n_k_wos_desc =
+    case 2: // dims ordered g, n, k, ho, wo
-            HostTensorDescriptor({conv_param.G_,
+        return HostTensorDescriptor({conv_param.G_,
-                                  conv_param.N_,
+                                     conv_param.N_,
-                                  conv_param.K_,
+                                     conv_param.K_,
-                                  conv_param.output_spatial_lengths_[0],
+                                     conv_param.output_spatial_lengths_[0],
-                                  conv_param.output_spatial_lengths_[1],
+                                     conv_param.output_spatial_lengths_[1]},
-                                  conv_param.output_spatial_lengths_[2]},
+                                    {
-                                 {
+                                        conv_param.K_, // g
-                                     conv_param.K_, // g
+                                        0,             // n
-                                     0,             // n
+                                        1,             // k
-                                     1,             // k
+                                        0,             // ho
-                                     0,             // z
+                                        0              // wo
-                                     0,             // y
+                                    });
-                                     0              // x
+    case 3: // dims ordered g, n, k, z, y, x
-                                 });
+        return HostTensorDescriptor({conv_param.G_,
+                                     conv_param.N_,
-        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
+                                     conv_param.K_,
+                                     conv_param.output_spatial_lengths_[0],
+                                     conv_param.output_spatial_lengths_[1],
+                                     conv_param.output_spatial_lengths_[2]},
+                                    {
+                                        conv_param.K_, // g
+                                        0,             // n
+                                        1,             // k
+                                        0,             // z
+                                        0,             // y
+                                        0              // x
+                                    });
+    }
+    throw std::runtime_error("unsuppored # dim spatial"); // no case matched; "unsuppored" [sic]
+}
+inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvParam& conv_param)
+{ // Builds the output HostTensorDescriptor for 1, 2 or 3 spatial dims; throws otherwise.
+    switch(conv_param.num_dim_spatial_) // each case passes two brace lists, in the same dim order
+    { // first list reads as lengths, second as strides - TODO confirm vs HostTensorDescriptor
+    case 1: // dims ordered g, n, k, wo
+        return HostTensorDescriptor(
+            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
+            {
+                conv_param.K_,                                                         // g
+                conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
+                1,                                                                     // k
+                conv_param.G_ * conv_param.K_                                          // wo
+            });
+    case 2: // dims ordered g, n, k, ho, wo
+        return HostTensorDescriptor(
+            {conv_param.G_,
+             conv_param.N_,
+             conv_param.K_,
+             conv_param.output_spatial_lengths_[0],
+             conv_param.output_spatial_lengths_[1]},
+            {
+                conv_param.K_, // g
+                conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
+                    conv_param.G_ * conv_param.K_,                                     // n
+                1,                                                                     // k
+                conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
+                conv_param.G_ * conv_param.K_                                          // wo
+            });
+    case 3: // part of this call lies outside the diff hunks shown here
+        return HostTensorDescriptor(
             {conv_param.G_,
              conv_param.N_,
              conv_param.K_,
@@ -423,37 +349,7 @@ int main(int argc, char* argv[])
                 conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
                 conv_param.G_ * conv_param.K_                                          // wo
             });
-        return run_grouped_conv_fwd_bias_relu_add<3,
-                                                  InKernelDataType,
-                                                  WeiKernelDataType,
-                                                  CShuffleDataType,
-                                                  OutKernelDataType,
-                                                  InElementOp,
-                                                  WeiElementOp,
-                                                  OutElementOp,
-                                                  InUserDataType,
-                                                  WeiUserDataType,
-                                                  OutUserDataType,
-                                                  DeviceGroupedConvNDFwdInstance<3,
-                                                                                 InLayout,
-                                                                                 WeiLayout,
-                                                                                 BiasLayout,
-                                                                                 ResidualLayout,
-                                                                                 OutLayout>>(
-            do_verification,
-            init_method,
-            time_kernel,
-            conv_param,
-            in_g_n_c_wis_desc,
-            wei_g_k_c_xs_desc,
-            bias_g_n_k_wos_desc,
-            residual_g_n_k_wos_desc,
-            out_g_n_k_wos_desc,
-            in_element_op,
-            wei_element_op,
-            out_element_op);
     }
-    return 0;
+    throw std::runtime_error("unsuppored # dim spatial"); // no case matched; "unsuppored" [sic]
 }