Merge remote-tracking branch 'origin/develop' into wavelet_model

95a83c6e · Adam Osewski · 5b7c2432 · 892a8d76 · 95a83c6e · 95a83c6e
Commit 95a83c6e authored Nov 18, 2022 by Adam Osewski
20 changed files
--- a/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
@@ -6,6 +6,8 @@
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp"
+#include "ck/library/utility/literals.hpp"
 using F16 = ck::half_t;
 using F32 = float;
@@ -135,15 +137,15 @@ int main(int argc, char* argv[])
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-                                            std::vector<std::size_t>({stride, 1}));
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
-                                            std::vector<std::size_t>({1, stride}));
            }
        };
@@ -240,7 +242,7 @@ int main(int argc, char* argv[])
            show_2d_matrix(std::cout << "c_host  :", c_m_n_host_result) << std::endl;
        }
 #endif
-        ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+        ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
    }
    return 0;

--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
@@ -32,14 +32,12 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
    {
    case 0: break;
    case 1:
-        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k.begin(),
+        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
-                                                                             a_m_k.end());
+        ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n);
-        ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n.begin(),
-                                                                             b_k_n.end());
        break;
    default:
-        ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k.begin(), a_m_k.end());
+        ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
-        ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n.begin(), b_k_n.end());
+        ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
    }
    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
@@ -133,11 +131,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
        c_m_n_device_result = c_m_n_device_result_converted.CopyAsType<CDataType>();
-        return ck::utils::check_err(c_m_n_device_result_converted.mData, c_m_n_host_result.mData);
+        return ck::utils::check_err(c_m_n_device_result_converted, c_m_n_host_result);
 #else
        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
-        return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+        return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
 #endif
    }

--- a/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
+++ b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
@@ -14,6 +14,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -177,15 +178,15 @@ int main(int argc, char* argv[])
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-                                            std::vector<std::size_t>({stride, 1}));
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
-                                            std::vector<std::size_t>({1, stride}));
            }
        };
@@ -271,8 +272,7 @@ int main(int argc, char* argv[])
    if(do_verification)
    {
-        Tensor<CShuffleDataType> c_m_n(HostTensorDescriptor(
+        Tensor<CShuffleDataType> c_m_n({M, N});
-            std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                                BDataType,
@@ -299,7 +299,7 @@ int main(int argc, char* argv[])
        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
-        return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData) ? 0 : 1;
+        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
    }
    return 0;

--- a/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
+++ b/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
@@ -15,6 +15,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -155,15 +156,15 @@ int main(int argc, char* argv[])
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-                                            std::vector<std::size_t>({stride, 1}));
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
-                                            std::vector<std::size_t>({1, stride}));
            }
        };
@@ -275,7 +276,7 @@ int main(int argc, char* argv[])
            }
        }
-        return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData) ? 0 : 1;
+        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
    }
    return 0;

--- a/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
+++ b/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
@@ -124,7 +124,7 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
    if(config.do_verification)
    {
-        Tensor<AccDataType> c_m_n(HostTensorDescriptor{M, N});
+        Tensor<AccDataType> c_m_n({M, N});
        auto ref_gemm    = ReferenceGemmInstance{};
        auto ref_invoker = ref_gemm.MakeInvoker();
@@ -147,9 +147,9 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
 #ifdef BUILD_INT4_EXAMPLE
        const Tensor<EDataType> e_m_n_device_result_converted(e_m_n_device_result);
-        return ck::utils::check_err(e_m_n_device_result_converted.mData, e_m_n_host_result.mData);
+        return ck::utils::check_err(e_m_n_device_result_converted, e_m_n_host_result);
 #else
-        return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
+        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
 #endif
    }

--- a/example/09_convnd_fwd/CMakeLists.txt
+++ b/example/09_convnd_fwd/CMakeLists.txt
@@ -4,3 +4,7 @@ add_example_executable(example_convnd_fwd_xdl_bf16 convnd_fwd_xdl_bf16.cpp)
 add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp)
 # FIXME: re-enable this example as test when SWDEV-335738 is fixed
 add_example_executable_no_testing(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp)
+add_example_executable(example_convnd_fwd_dl_fp16 convnd_fwd_dl_fp16.cpp)
+add_example_executable(example_convnd_fwd_dl_fp32 convnd_fwd_dl_fp32.cpp)
+add_example_executable(example_convnd_fwd_dl_int8 convnd_fwd_dl_int8.cpp)
--- a/example/09_convnd_fwd/convnd_fwd_common.hpp
+++ b/example/09_convnd_fwd/convnd_fwd_common.hpp
@@ -10,6 +10,7 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
@@ -84,7 +85,7 @@ bool run_grouped_conv_fwd(bool do_verification,
    std::array<ck::index_t, NDimSpatial> input_left_pads{};
    std::array<ck::index_t, NDimSpatial> input_right_pads{};
-    auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
+    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
@@ -164,7 +165,7 @@ bool run_grouped_conv_fwd(bool do_verification,
        out_device_buf.FromDevice(out_device.mData.data());
        return ck::utils::check_err(
-            out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
+            out_device, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
    }
    return true;

--- a/example/20_convnd_bwd_weight/convnd_bwd_weight_common.hpp
+++ b/example/20_convnd_bwd_weight/convnd_bwd_weight_common.hpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstdlib>
 #include <iostream>
 #include <numeric>
-#include <initializer_list>
+#include <type_traits>
-#include <cstdlib>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
 void print_helper_msg()
 {
@@ -33,77 +34,97 @@ template <ck::index_t NDimSpatial,
          typename InElementOp,
          typename WeiElementOp,
          typename OutElementOp,
-          typename DeviceConvBwdWeightInstance>
+          typename DeviceConvNDFwdInstance>
-int run_conv_bwd_weight(bool do_verification,
+bool run_grouped_conv_fwd_dl(bool do_verification,
-                        int init_method,
+                             int init_method,
-                        bool time_kernel,
+                             bool time_kernel,
-                        const ck::utils::conv::ConvParam& conv_param,
+                             const ck::utils::conv::ConvParam& conv_param,
-                        const HostTensorDescriptor& in_g_n_c_wis_desc,
+                             const HostTensorDescriptor& in_g_n_c_wis_desc,
-                        const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                             const HostTensorDescriptor& wei_g_k_c_xs_desc,
-                        const HostTensorDescriptor& out_g_n_k_wos_desc,
+                             const HostTensorDescriptor& out_g_n_k_wos_desc,
-                        const InElementOp& in_element_op,
+                             const InElementOp& in_element_op,
-                        const WeiElementOp& wei_element_op,
+                             const WeiElementOp& wei_element_op,
-                        const OutElementOp& out_element_op,
+                             const OutElementOp& out_element_op)
-                        ck::index_t split_k)
 {
    Tensor<InDataType> in(in_g_n_c_wis_desc);
-    Tensor<WeiDataType> wei_host_result(wei_g_k_c_xs_desc);
+    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
-    Tensor<WeiDataType> wei_device_result(wei_g_k_c_xs_desc);
+    Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
-    Tensor<OutDataType> out(out_g_n_k_wos_desc);
+    Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
    std::cout << "in: " << in.mDesc << std::endl;
-    std::cout << "wei: " << wei_host_result.mDesc << std::endl;
+    std::cout << "wei: " << wei.mDesc << std::endl;
-    std::cout << "out: " << out.mDesc << std::endl;
+    std::cout << "out: " << out_host.mDesc << std::endl;
    switch(init_method)
    {
    case 0: break;
    case 1:
        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
-        out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
        break;
-    default:
+    case 2:
        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
-        out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
+        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
+        break;
+    default:
+        in.GenerateTensorValue(GeneratorTensor_1<InDataType>{1});
+        wei.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
    }
    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
-    DeviceMem out_device_buf(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
    in_device_buf.ToDevice(in.mData.data());
-    out_device_buf.ToDevice(out.mData.data());
+    wei_device_buf.ToDevice(wei.mData.data());
-    // init to 0
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
-    wei_device_buf.SetZero();
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
-    // do GEMM
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
-    auto conv     = DeviceConvBwdWeightInstance{};
+    std::array<ck::index_t, NDimSpatial + 3> c_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> c_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+    auto copy = [](auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
+    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
+    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
+    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
+    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
+    copy(out_g_n_k_wos_desc.GetLengths(), c_g_n_k_wos_lengths);
+    copy(out_g_n_k_wos_desc.GetStrides(), c_g_n_k_wos_strides);
+    copy(conv_param.conv_filter_strides_, conv_filter_strides);
+    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_param.input_left_pads_, input_left_pads);
+    copy(conv_param.input_right_pads_, input_right_pads);
+    // do Conv
+    auto conv     = DeviceConvNDFwdInstance{};
    auto invoker  = conv.MakeInvoker();
-    auto argument = conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+    auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
-                                      static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
+                                      wei_device_buf.GetDeviceBuffer(),
-                                      static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+                                      out_device_buf.GetDeviceBuffer(),
-                                      conv_param.N_,
+                                      a_g_n_c_wis_lengths,
-                                      conv_param.K_,
+                                      a_g_n_c_wis_strides,
-                                      conv_param.C_,
+                                      b_g_k_c_xs_lengths,
-                                      conv_param.input_spatial_lengths_,
+                                      b_g_k_c_xs_strides,
-                                      conv_param.filter_spatial_lengths_,
+                                      c_g_n_k_wos_lengths,
-                                      conv_param.output_spatial_lengths_,
+                                      c_g_n_k_wos_strides,
-                                      conv_param.conv_filter_strides_,
+                                      conv_filter_strides,
-                                      conv_param.conv_filter_dilations_,
+                                      conv_filter_dilations,
-                                      conv_param.input_left_pads_,
+                                      input_left_pads,
-                                      conv_param.input_right_pads_,
+                                      input_right_pads,
                                      in_element_op,
                                      wei_element_op,
-                                      out_element_op,
+                                      out_element_op);
-                                      split_k);
    if(!conv.IsSupportedArgument(argument))
    {
-        std::cout << "wrong! device_conv with the specified compilation parameters does "
+        return true;
-                     "not support this Conv problem"
-                  << std::endl;
-        return 1;
    }
    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
@@ -111,42 +132,40 @@ int run_conv_bwd_weight(bool do_verification,
    std::size_t flop      = conv_param.GetFlops();
    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
-    float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
+    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
    float gb_per_sec = num_btype / 1.E6 / avg_time;
+    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
-    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << conv.GetTypeString() << std::endl;
    if(do_verification)
    {
-        auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
+        auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
-                                                                           InDataType,
+                                                                     InDataType,
-                                                                           WeiDataType,
+                                                                     WeiDataType,
-                                                                           OutDataType,
+                                                                     OutDataType,
-                                                                           InElementOp,
+                                                                     InElementOp,
-                                                                           WeiElementOp,
+                                                                     WeiElementOp,
-                                                                           OutElementOp>{};
+                                                                     OutElementOp>();
-        auto ref_invoker = ref_conv.MakeInvoker();
+        auto ref_invoker  = ref_conv.MakeInvoker();
        auto ref_argument = ref_conv.MakeArgument(in,
-                                                  wei_host_result,
+                                                  wei,
-                                                  out,
+                                                  out_host,
                                                  conv_param.conv_filter_strides_,
                                                  conv_param.conv_filter_dilations_,
                                                  conv_param.input_left_pads_,
                                                  conv_param.input_right_pads_,
-                                                  InElementOp{},
+                                                  in_element_op,
-                                                  WeiElementOp{},
+                                                  wei_element_op,
-                                                  OutElementOp{});
+                                                  out_element_op);
        ref_invoker.Run(ref_argument);
-        wei_device_buf.FromDevice(wei_device_result.mData.data());
+        out_device_buf.FromDevice(out_device.mData.data());
-        return ck::utils::check_err(wei_device_result.mData, wei_host_result.mData) ? 0 : 1;
+        return ck::utils::check_err(
+            out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
    }
-    return 0;
+    return true;
 }
--- a/example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include "convnd_fwd_dl_common.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+using InDataType  = ck::half_t;
+using WeiDataType = ck::half_t;
+using AccDataType = float;
+using OutDataType = ck::half_t;
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+static constexpr auto GemmPadingSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
+// clang-format off
+using DeviceGroupedConvNDFwdInstance = ck::tensor_operation::device::DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK
+// ######|        NDim|     InData|     WeiData|     OutData|     AccData| InLayout| WeiLayout| OutLayout|           In|           Wei|           Out|    Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
+// ######|     Spatial|       Type|        Type|        Type|        Type|         |          |          |  Elementwise|   Elementwise|   Elementwise|        Forward|    Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
+// ######|            |           |            |            |            |         |          |          |    Operation|     Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
+// ######|            |           |            |            |            |         |          |          |             |              |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
+         < NDimSpatial, InDataType, WeiDataType, OutDataType, AccDataType, InLayout, WeiLayout, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp,       ConvSpec,    GemmPadingSpec,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,               5,                 4>;
+// clang-format on
+#include "run_convnd_fwd_dl_example.inc"
+int main(int argc, char* argv[]) { return run_convnd_fwd_dl_example(argc, argv) ? 0 : 1; }
--- a/example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include "convnd_fwd_dl_common.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+using InDataType  = float;
+using WeiDataType = float;
+using AccDataType = float;
+using OutDataType = float;
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+static constexpr auto GemmPadingSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
+// clang-format off
+using DeviceGroupedConvNDFwdInstance = ck::tensor_operation::device::DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK
+// ######|        NDim|     InData|     WeiData|     OutData|     AccData| InLayout| WeiLayout| OutLayout|           In|           Wei|           Out|    Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
+// ######|     Spatial|       Type|        Type|        Type|        Type|         |          |          |  Elementwise|   Elementwise|   Elementwise|        Forward|    Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
+// ######|            |           |            |            |            |         |          |          |    Operation|     Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
+// ######|            |           |            |            |            |         |          |          |             |              |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
+         < NDimSpatial, InDataType, WeiDataType, OutDataType, AccDataType, InLayout, WeiLayout, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp,       ConvSpec,    GemmPadingSpec,   256,   128,   128,    16,  1,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,       S<1, 1, 1, 1>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,       S<1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>,               5,                 4>;
+// clang-format on
+#include "run_convnd_fwd_dl_example.inc"
+int main(int argc, char* argv[]) { return run_convnd_fwd_dl_example(argc, argv) ? 0 : 1; }
--- a/example/09_convnd_fwd/convnd_fwd_dl_int8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_dl_int8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include "convnd_fwd_dl_common.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+// Element types used by this example: 8-bit integers for the input, weight and
+// output tensors, and a 32-bit integer for AccDataType.
+using InDataType  = int8_t;
+using WeiDataType = int8_t;
+using AccDataType = int32_t;
+using OutDataType = int8_t;
+// Shorthand for ck::Sequence; used for the tuple-like arguments of the
+// instance definition further down in this file.
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+// The input, weight and output element-wise operations are all PassThrough.
+using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+// Specialization selectors handed to the device instance: the Default forward
+// convolution specialization and the MNKPadding GEMM specialization.
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+static constexpr auto GemmPadingSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+// Device instance used by this example, parameterised on the number of spatial
+// dimensions and on the input / weight / output layouts. Every other template
+// argument is fixed here; the column headers in the table below name each
+// positional argument. Data types, element-wise operations and specializations
+// come from the aliases and constants declared above in this file.
+template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
+// clang-format off
+using DeviceGroupedConvNDFwdInstance = ck::tensor_operation::device::DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK
+// ######|        NDim|     InData|     WeiData|     OutData|     AccData| InLayout| WeiLayout| OutLayout|           In|           Wei|           Out|    Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
+// ######|     Spatial|       Type|        Type|        Type|        Type|         |          |          |  Elementwise|   Elementwise|   Elementwise|        Forward|    Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
+// ######|            |           |            |            |            |         |          |          |    Operation|     Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
+// ######|            |           |            |            |            |         |          |          |             |              |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
+         < NDimSpatial, InDataType, WeiDataType, OutDataType, AccDataType, InLayout, WeiLayout, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp,       ConvSpec,    GemmPadingSpec,   256,   128,   128,    16,  4,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,               5,                 4>;
+// clang-format on
+#include "run_convnd_fwd_dl_example.inc"
+// Program entry point: forwards the command line to the shared example driver
+// and maps its boolean result onto the process exit status (0 on true, 1 on false).
+int main(int argc, char* argv[])
+{
+    const bool passed = run_convnd_fwd_dl_example(argc, argv);
+    if(passed)
+    {
+        return 0;
+    }
+    return 1;
+}
--- a/example/09_convnd_fwd/run_convnd_fwd_dl_example.inc
+++ b/example/09_convnd_fwd/run_convnd_fwd_dl_example.inc
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+// Shared driver for the DL forward-convolution examples.
+//
+// Accepted command lines (arguments after the program name):
+//   * none                      - run the built-in default problem;
+//   * exactly 3                 - do_verification, init_method, time_kernel;
+//   * 4 or more                 - the three flags above, then num_dim_spatial,
+//                                 then the values parse_conv_param() reads
+//                                 starting at argv[5].
+//
+// Parameters:
+//   argc, argv - the command line exactly as received by main().
+//
+// Returns the result of run_grouped_conv_fwd_dl() for the selected problem, or
+// false when only 1 or 2 arguments are given or when the number of spatial
+// dimensions is not 1, 2 or 3.
+bool run_convnd_fwd_dl_example(int argc, char* argv[])
+{
+    print_helper_msg();
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+    // Default problem, used when no convolution parameters are given on the command line.
+    ck::utils::conv::ConvParam conv_param{
+        2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
+    // The three leading flags must be supplied together: with only 1 or 2
+    // arguments, reading argv[2] / argv[3] would go past what was provided.
+    if(argc == 2 || argc == 3)
+    {
+        std::cerr << "Invalid number of arguments: expected none, exactly 3 "
+                     "(do_verification init_method time_kernel), or those 3 followed by "
+                     "num_dim_spatial and the convolution parameters"
+                  << std::endl;
+        return false;
+    }
+    if(argc >= 4)
+    {
+        do_verification = std::stoi(argv[1]) != 0;
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]) != 0;
+    }
+    if(argc >= 5)
+    {
+        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
+        // NOTE(review): parse_conv_param() reads further entries starting at
+        // argv[5]; argc is not checked here against how many it consumes.
+        conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
+    }
+    const auto in_element_op  = InElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+    const auto out_element_op = OutElementOp{};
+    // Builds the three host tensor descriptors for the given layouts and runs
+    // the convolution with the device instance matching the dimensionality.
+    const auto run = [&](auto ndim_spatial, auto in_layout, auto wei_layout, auto out_layout) {
+        constexpr ck::index_t ndim_spatial_value = ndim_spatial.value;
+        std::cout << "ndim_spatial_value: " << ndim_spatial_value << std::endl;
+        using InLayout  = decltype(in_layout);
+        using WeiLayout = decltype(wei_layout);
+        using OutLayout = decltype(out_layout);
+        const auto in_g_n_c_wis_desc =
+            ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
+                conv_param);
+        const auto wei_g_k_c_xs_desc =
+            ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(
+                conv_param);
+        const auto out_g_n_k_wos_desc =
+            ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
+                conv_param);
+        return run_grouped_conv_fwd_dl<
+            ndim_spatial_value,
+            InDataType,
+            WeiDataType,
+            OutDataType,
+            InElementOp,
+            WeiElementOp,
+            OutElementOp,
+            DeviceGroupedConvNDFwdInstance<ndim_spatial_value, InLayout, WeiLayout, OutLayout>>(
+            do_verification,
+            init_method,
+            time_kernel,
+            conv_param,
+            in_g_n_c_wis_desc,
+            wei_g_k_c_xs_desc,
+            out_g_n_k_wos_desc,
+            in_element_op,
+            wei_element_op,
+            out_element_op);
+    };
+    namespace ctc = ck::tensor_layout::convolution;
+    // Runtime dimensionality selects the compile-time instantiation.
+    if(conv_param.num_dim_spatial_ == 1)
+    {
+        return run(ck::Number<1>{}, ctc::GNWC{}, ctc::GKXC{}, ctc::GNWK{});
+    }
+    if(conv_param.num_dim_spatial_ == 2)
+    {
+        return run(ck::Number<2>{}, ctc::GNHWC{}, ctc::GKYXC{}, ctc::GNHWK{});
+    }
+    if(conv_param.num_dim_spatial_ == 3)
+    {
+        return run(ck::Number<3>{}, ctc::GNDHWC{}, ctc::GKZYXC{}, ctc::GNDHWK{});
+    }
+    // No kernel was launched, so this must not be reported as a pass.
+    std::cerr << "Unsupported num_dim_spatial: " << conv_param.num_dim_spatial_
+              << " (expected 1, 2 or 3)" << std::endl;
+    return false;
+}
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
@@ -16,6 +16,7 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
@@ -140,9 +141,7 @@ make_r0_host_tensor_descriptor(const ck::utils::conv::ConvParam& problem_size)
 {
    std::vector<ck::index_t> dimensions{problem_size.G_, problem_size.N_};
-    std::copy(begin(problem_size.output_spatial_lengths_),
+    ck::ranges::copy(problem_size.output_spatial_lengths_, std::back_inserter(dimensions));
-              end(problem_size.output_spatial_lengths_),
-              std::back_inserter(dimensions));
    return HostTensorDescriptor(dimensions);
 }
@@ -158,10 +157,3 @@ void unpack_host_tensor_descriptor(const HostTensorDescriptor& descriptor,
    assert(size(descriptor.GetStrides()) == size(strides));
    std::copy_n(begin(descriptor.GetStrides()), size(descriptor.GetStrides()), begin(strides));
 }
-template <typename Range, typename OutputIterator>
-auto copy(const Range& range, OutputIterator iter)
-    -> decltype(std::copy(std::begin(range), std::end(range), iter))
-{
-    return std::copy(std::begin(range), std::end(range), iter);
-}
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
@@ -77,15 +77,12 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
    {
    case 0: break;
    case 1:
-        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-8, 7}(conv_input.begin(),
+        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-8, 7}(conv_input);
-                                                                         conv_input.end());
+        ck::utils::FillUniformDistributionIntegerValue<BDataType>{-8, 7}(conv_weight);
-        ck::utils::FillUniformDistributionIntegerValue<BDataType>{-8, 7}(conv_weight.begin(),
-                                                                         conv_weight.end());
        break;
    default:
-        ck::utils::FillUniformDistribution<ADataType>{-5, 5}(conv_input.begin(), conv_input.end());
+        ck::utils::FillUniformDistribution<ADataType>{-5, 5}(conv_input);
-        ck::utils::FillUniformDistribution<BDataType>{-5, 5}(conv_weight.begin(),
+        ck::utils::FillUniformDistribution<BDataType>{-5, 5}(conv_weight);
-                                                             conv_weight.end());
    }
    DeviceMem conv_input_device_buf(sizeof(ADataType) * conv_input.mDesc.GetElementSpaceSize());
@@ -123,10 +120,10 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
        conv_output_g_n_k_wos_desc, conv_output_g_n_k_wos_lengths, conv_output_g_n_k_wos_strides);
    unpack_host_tensor_descriptor(r0_desc, r0_lengths, r0_strides);
-    copy(problem_size.conv_filter_strides_, begin(conv_filter_strides));
+    ck::ranges::copy(problem_size.conv_filter_strides_, begin(conv_filter_strides));
-    copy(problem_size.conv_filter_dilations_, begin(conv_filter_dilations));
+    ck::ranges::copy(problem_size.conv_filter_dilations_, begin(conv_filter_dilations));
-    copy(problem_size.input_left_pads_, begin(input_left_pads));
+    ck::ranges::copy(problem_size.input_left_pads_, begin(input_left_pads));
-    copy(problem_size.input_right_pads_, begin(input_right_pads));
+    ck::ranges::copy(problem_size.input_right_pads_, begin(input_right_pads));
    // run Conv + Reduction on device
    auto conv     = DeviceInstance<NDimSpatial>{};
@@ -276,16 +273,13 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
        conv_output_device_buf.FromDevice(conv_output_device.mData.data());
        r0_device_buf.FromDevice(r0_device.mData.data());
-        return ck::utils::check_err(conv_output_device.mData,
+        return ck::utils::check_err(conv_output_device,
-                                    conv_output_host.mData,
+                                    conv_output_host,
                                    "Error: incorrect results! (Matrix E)",
                                    1e-5f,
                                    1e-4f) &&
-               ck::utils::check_err(r0_device.mData,
+               ck::utils::check_err(
-                                    r0_host.mData,
+                   r0_device, r0_host, "Error: incorrect results! (Matrix R0)", 1e-5f, 1e-4f);
-                                    "Error: incorrect results! (Matrix R0)",
-                                    1e-5f,
-                                    1e-4f);
    }
    return true;

--- a/example/12_reduce/reduce_blockwise.cpp
+++ b/example/12_reduce/reduce_blockwise.cpp
@@ -140,6 +140,10 @@ bool reduce_blockwise_test(bool do_verification,
        if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size())
            return;
+        std::array<int, ShapeType::NumReduceDim_> arrReduceDims;
+        ck::ranges::copy(reduceDims, arrReduceDims.begin());
        result = reduce_blockwise_impl<InOutDataType,
                                       AccDataType,
                                       ReduceOpId,
@@ -147,7 +151,7 @@ bool reduce_blockwise_test(bool do_verification,
                                       ShapeType::NumReduceDim_,
                                       PropagateNan,
                                       OutputIndex>(
-            do_verification, init_method, time_kernel, inLengths, reduceDims, alpha, beta);
+            do_verification, init_method, time_kernel, inLengths, arrReduceDims, alpha, beta);
        matched = true;
    });

--- a/example/12_reduce/reduce_blockwise_impl.hpp
+++ b/example/12_reduce/reduce_blockwise_impl.hpp
@@ -10,6 +10,7 @@
 #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
@@ -30,7 +31,7 @@ int reduce_blockwise_impl(bool do_verification,
                          int init_method,
                          bool time_kernel,
                          const std::vector<size_t>& inLengths,
-                          const std::vector<int>& reduceDims,
+                          const std::array<int, NumReduceDim>& reduceDims,
                          float alpha,
                          float beta)
@@ -38,6 +39,8 @@ int reduce_blockwise_impl(bool do_verification,
    using namespace ck;
    using namespace ck::tensor_operation::device;
+    constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim;
    constexpr bool op_support_indices =
        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
         ReduceOpId == ReduceTensorOp::AMAX);
@@ -143,7 +146,7 @@ int reduce_blockwise_impl(bool do_verification,
    std::vector<size_t> outLengths;
-    std::vector<int> invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);
+    auto invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);
    if(invariantDims.empty())
        outLengths.push_back(1);
@@ -256,22 +259,22 @@ int reduce_blockwise_impl(bool do_verification,
                       acc_elementwise_op);
    };
-    std::vector<ck::index_t> i_inLengths;
+    std::array<index_t, Rank> arrInLengths;
-    std::vector<ck::index_t> i_inStrides;
+    std::array<index_t, Rank> arrInStrides;
-    std::vector<ck::index_t> i_outLengths;
+    std::array<index_t, NumOutDim> arrOutLengths;
-    std::vector<ck::index_t> i_outStrides;
+    std::array<index_t, NumOutDim> arrOutStrides;
-    i_inLengths.assign(inLengths.begin(), inLengths.end());
+    ck::ranges::copy(inLengths, arrInLengths.begin());
-    i_inStrides.assign(inStrides.begin(), inStrides.end());
+    ck::ranges::copy(inStrides, arrInStrides.begin());
-    i_outLengths.assign(outLengths.begin(), outLengths.end());
+    ck::ranges::copy(outLengths, arrOutLengths.begin());
-    i_outStrides.assign(outStrides.begin(), outStrides.end());
+    ck::ranges::copy(outStrides, arrOutStrides.begin());
    auto reduce = DeviceReduceInstance{};
-    auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths,
+    auto argument_ptr = reduce.MakeArgumentPointer(arrInLengths,
-                                                   i_inStrides,
+                                                   arrInStrides,
-                                                   i_outLengths,
+                                                   arrOutLengths,
-                                                   i_outStrides,
+                                                   arrOutStrides,
                                                   reduceDims,
                                                   alpha,
                                                   beta,
@@ -322,12 +325,12 @@ int reduce_blockwise_impl(bool do_verification,
 #endif
            out_dev.FromDevice(out.mData.data());
-        pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
+        pass = pass && ck::utils::check_err(out, out_ref);
        if(OutputIndex)
        {
            out_index_dev.FromDevice(out_indices.mData.data());
-            pass = pass && ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
+            pass = pass && ck::utils::check_err(out_indices, out_indices_ref);
        };
    };

--- a/example/12_reduce/reduce_blockwise_two_call.cpp
+++ b/example/12_reduce/reduce_blockwise_two_call.cpp
@@ -90,15 +90,15 @@ static bool time_kernel;
 int main(int argc, char* argv[])
 {
    // used by the device reduction
-    const std::vector<int> reduceDims_1    = {4};
+    const std::array<int, 1> reduceDims_1 = {4};
-    const std::vector<int> invariantDims_1 = {0, 1, 2, 3};
+    // const std::array<int, 4> invariantDims_1 = {0, 1, 2, 3};
-    const std::vector<int> reduceDims_2    = {3};
+    const std::array<int, 1> reduceDims_2 = {3};
-    const std::vector<int> invariantDims_2 = {0, 1, 2};
+    // const std::array<int, 3> invariantDims_2 = {0, 1, 2};
    // used by the host reduction
-    const std::vector<int> reduceDims    = {3, 4};
+    const std::array<int, 2> reduceDims    = {3, 4};
-    const std::vector<int> invariantDims = {0, 1, 2};
+    const std::array<int, 3> invariantDims = {0, 1, 2};
    const std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};
@@ -214,26 +214,26 @@ int main(int argc, char* argv[])
                       acc_elementwise_op);
    };
-    std::vector<ck::index_t> i_inLengths_1;
+    std::array<index_t, 5> arrInLengths_1;
-    std::vector<ck::index_t> i_inStrides_1;
+    std::array<index_t, 5> arrInStrides_1;
-    std::vector<ck::index_t> i_inLengths_2;
+    std::array<index_t, 4> arrInLengths_2;
-    std::vector<ck::index_t> i_inStrides_2;
+    std::array<index_t, 4> arrInStrides_2;
-    std::vector<ck::index_t> i_outLengths;
+    std::array<index_t, 3> arrOutLengths;
-    std::vector<ck::index_t> i_outStrides;
+    std::array<index_t, 3> arrOutStrides;
-    i_inLengths_1.assign(inLengths_1.begin(), inLengths_1.end());
+    ck::ranges::copy(inLengths_1, arrInLengths_1.begin());
-    i_inStrides_1.assign(inStrides_1.begin(), inStrides_1.end());
+    ck::ranges::copy(inStrides_1, arrInStrides_1.begin());
-    i_inLengths_2.assign(inLengths_2.begin(), inLengths_2.end());
+    ck::ranges::copy(inLengths_2, arrInLengths_2.begin());
-    i_inStrides_2.assign(inStrides_2.begin(), inStrides_2.end());
+    ck::ranges::copy(inStrides_2, arrInStrides_2.begin());
-    i_outLengths.assign(outLengths.begin(), outLengths.end());
+    ck::ranges::copy(outLengths, arrOutLengths.begin());
-    i_outStrides.assign(outStrides.begin(), outStrides.end());
+    ck::ranges::copy(outStrides, arrOutStrides.begin());
    auto reduce_1 = DeviceReduceInstance_1{};
-    auto argument_ptr_1 = reduce_1.MakeArgumentPointer(i_inLengths_1,
+    auto argument_ptr_1 = reduce_1.MakeArgumentPointer(arrInLengths_1,
-                                                       i_inStrides_1,
+                                                       arrInStrides_1,
-                                                       i_inLengths_2,
+                                                       arrInLengths_2,
-                                                       i_inStrides_2,
+                                                       arrInStrides_2,
                                                       reduceDims_1,
                                                       1.0f,
                                                       0.0f,
@@ -255,10 +255,10 @@ int main(int argc, char* argv[])
    auto reduce_2 = DeviceReduceInstance_2{};
-    auto argument_ptr_2 = reduce_2.MakeArgumentPointer(i_inLengths_2,
+    auto argument_ptr_2 = reduce_2.MakeArgumentPointer(arrInLengths_2,
-                                                       i_inStrides_2,
+                                                       arrInStrides_2,
-                                                       i_outLengths,
+                                                       arrOutLengths,
-                                                       i_outStrides,
+                                                       arrOutStrides,
                                                       reduceDims_2,
                                                       alpha,
                                                       beta,
@@ -294,7 +294,7 @@ int main(int argc, char* argv[])
    if(do_verify)
    {
        out_dev.FromDevice(out.mData.data());
-        pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
+        pass = pass && ck::utils::check_err(out, out_ref);
    };
    return (pass ? 0 : 1);

--- a/example/12_reduce/reduce_example_common.hpp
+++ b/example/12_reduce/reduce_example_common.hpp
@@ -5,11 +5,10 @@
 #include "ck/ck.hpp"
-template <ck::index_t Rank, ck::index_t NumReduceDim>
+template <int Rank, int NumReduceDim>
-std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
+static inline std::array<int, Rank - NumReduceDim>
+get_invariant_dims(const std::array<int, NumReduceDim>& reduceDims)
 {
-    assert(NumReduceDim == reduceDims.size());
    int reduceFlag = 0;
    // flag the bits for the reduceDims
@@ -18,13 +17,15 @@ std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
        reduceFlag |= 1 << reduceDims[i];
    };
-    std::vector<int> invariantDims;
+    std::array<int, Rank - NumReduceDim> invariantDims;
    // collect invariant dimensions
+    int dim = 0;
    for(int i = 0; i < Rank; i++)
        if((reduceFlag & (1 << i)) == 0)
        {
-            invariantDims.push_back(i);
+            invariantDims[dim] = i;
+            dim++;
        };
    return invariantDims;

--- a/example/12_reduce/reduce_multiblock_atomic_add.cpp
+++ b/example/12_reduce/reduce_multiblock_atomic_add.cpp
@@ -138,13 +138,17 @@ bool reduce_multiblock_atomic_add_test(bool do_verification,
        if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size())
            return;
+        std::array<int, ShapeType::NumReduceDim_> a_reduceDims;
+        ck::ranges::copy(reduceDims, a_reduceDims.begin());
        result = reduce_multiblock_atomic_add_impl<InOutDataType,
                                                   AccDataType,
                                                   ReduceOpId,
                                                   ShapeType::Rank_,
                                                   ShapeType::NumReduceDim_,
                                                   PropagateNan>(
-            do_verification, init_method, time_kernel, inLengths, reduceDims, alpha, beta);
+            do_verification, init_method, time_kernel, inLengths, a_reduceDims, alpha, beta);
        matched = true;
    });

--- a/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp
+++ b/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp
@@ -10,6 +10,7 @@
 #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
@@ -29,7 +30,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
                                      int init_method,
                                      bool time_kernel,
                                      const std::vector<size_t>& inLengths,
-                                      const std::vector<int>& reduceDims,
+                                      const std::array<int, NumReduceDim>& reduceDims,
                                      float alpha,
                                      float beta)
@@ -37,6 +38,8 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
    using namespace ck;
    using namespace ck::tensor_operation::device;
+    constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim;
    constexpr bool op_support_atomic_add =
        (ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::AVG);
@@ -84,7 +87,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
    std::vector<size_t> outLengths;
-    std::vector<int> invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);
+    auto invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);
    if(invariantDims.empty())
        outLengths.push_back(1);
@@ -169,22 +172,22 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
                       acc_elementwise_op);
    };
-    std::vector<ck::index_t> i_inLengths;
+    std::array<index_t, Rank> arrInLengths;
-    std::vector<ck::index_t> i_inStrides;
+    std::array<index_t, Rank> arrInStrides;
-    std::vector<ck::index_t> i_outLengths;
+    std::array<index_t, NumOutDim> arrOutLengths;
-    std::vector<ck::index_t> i_outStrides;
+    std::array<index_t, NumOutDim> arrOutStrides;
-    i_inLengths.assign(inLengths.begin(), inLengths.end());
+    ck::ranges::copy(inLengths, arrInLengths.begin());
-    i_inStrides.assign(inStrides.begin(), inStrides.end());
+    ck::ranges::copy(inStrides, arrInStrides.begin());
-    i_outLengths.assign(outLengths.begin(), outLengths.end());
+    ck::ranges::copy(outLengths, arrOutLengths.begin());
-    i_outStrides.assign(outStrides.begin(), outStrides.end());
+    ck::ranges::copy(outStrides, arrOutStrides.begin());
    auto reduce = DeviceReduceInstance{};
-    auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths,
+    auto argument_ptr = reduce.MakeArgumentPointer(arrInLengths,
-                                                   i_inStrides,
+                                                   arrInStrides,
-                                                   i_outLengths,
+                                                   arrOutLengths,
-                                                   i_outStrides,
+                                                   arrOutStrides,
                                                   reduceDims,
                                                   alpha,
                                                   beta,
@@ -223,7 +226,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
    if(do_verification)
    {
        out_dev.FromDevice(out.mData.data());
-        pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
+        pass = pass && ck::utils::check_err(out, out_ref);
    };
    return (pass ? 0 : 1);