Unverified commit 53ea4713, authored by Qianfeng and committed by GitHub

Batchnorm-forward and batchnorm-infer implemented using generic kernels (#320)

* Implement multiple-reduction in one kernel (kernels, device ops, examples)

* Add generic elementwise kernel and device interface

* Add generator for normal-distributed data initialization

* Add host reference implementation of batchnorm-forward and batchnorm-infer

* Add examples for implementing batchnorm-forward and batchnorm-infer using generic kernels

* Remove unneeded include in batchnorm example

* Rename generic_elementwise to elementwise in kernel and device classes/functions

* Change gemm_layernorm examples to use DeviceElementwise instead of Device5AryElementwise

* Change example 19_binary_elementwise to use DeviceElementwise instead of DeviceBinaryElementwise

* Change device_cgemm_4gemm_xdl_cshuffle.hpp to use kernel_elementwise instead of kernel_binary_elementwise

* Add DeviceElementwiseBase and use it in device_normalize_instance.cpp

* Removing and renaming files

* Update to synchronize gemm_layernorm client example to the generic element-wise device op API

* Update to synchronize with the latest headers directory and HostTensorDescriptor interface renaming

* Merge two static member functions in device_elementwise.hpp

* Remove unary_elementwise_1d kernel and device
parent 5ee30459
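The diff below converges all of these call sites onto a single generic elementwise device op. For orientation, here is a condensed sketch of the new usage pattern; it mirrors the 1-D add example changed in this PR and assumes the same CK headers and host utilities those examples already use. The alias `DeviceAdd1D`, the tensor length, and the fill values are illustrative only:

```cpp
#include <array>
#include <vector>

#include "ck/ck.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"

using F16 = ck::half_t;
using Add = ck::tensor_operation::element_wise::Add;

// c[m] = a[m] + b[m] over a 1-d tensor of length M
using DeviceAdd1D =
    ck::tensor_operation::device::DeviceElementwise<ck::Tuple<F16, F16>, // input data types
                                                    ck::Tuple<F16>,      // output data types
                                                    Add,                 // elementwise functor
                                                    1,                   // number of dimensions
                                                    8,                   // MPerThread
                                                    ck::Sequence<8, 8>,  // scalarPerVector: a, b
                                                    ck::Sequence<8>>;    // scalarPerVector: c

int main()
{
    constexpr ck::index_t M = 1024;

    std::vector<F16> a_host(M, ck::type_convert<F16>(1.0f));
    std::vector<F16> b_host(M, ck::type_convert<F16>(2.0f));
    std::vector<F16> c_host(M);

    DeviceMem a_dev(sizeof(F16) * M);
    DeviceMem b_dev(sizeof(F16) * M);
    DeviceMem c_dev(sizeof(F16) * M);

    a_dev.ToDevice(a_host.data());
    b_dev.ToDevice(b_host.data());

    std::array<const void*, 2> input = {a_dev.GetDeviceBuffer(), b_dev.GetDeviceBuffer()};
    std::array<void*, 1> output      = {c_dev.GetDeviceBuffer()};

    // new argument order: lengths, input strides, output strides, inputs, outputs, functor
    std::array<ck::index_t, 1> lengths = {M};
    std::array<ck::index_t, 1> stride  = {1};

    auto op = DeviceAdd1D{};
    auto argument_ptr =
        op.MakeArgumentPointer(lengths, {stride, stride}, {stride}, input, output, Add{});

    if(!op.IsSupportedArgument(argument_ptr.get()))
        return 1;

    op.MakeInvokerPointer()->Run(argument_ptr.get(), StreamConfig{nullptr, false});

    c_dev.FromDevice(c_host.data());

    return 0;
}
```

The pattern that recurs throughout the diff: input and output data types are packed into `ck::Tuple`, per-tensor vector widths into `ck::Sequence`, and `MakeArgumentPointer` takes lengths, input strides, output strides, input pointers, output pointers, and the functor, in that order.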
......@@ -128,11 +128,14 @@ bool RunDeviceNormalize2D(normalize_op_ptr& p_op,
std::array<void*, 1> output = {p_y};
auto normalize_functor = ck::tensor_operation::element_wise::Normalize{};
auto argument_ptr = p_op->MakeArgumentPointer(input,
std::array<ck::index_t, 2> xyLengths = {M, N};
std::array<ck::index_t, 2> xyStrides = {StrideX, 1};
auto argument_ptr = p_op->MakeArgumentPointer(xyLengths,
{xyStrides, {1, 0}, {1, 0}, {0, 1}, {0, 1}},
{xyStrides},
input,
output,
{M, N},
{{StrideX, 1}, {1, 0}, {1, 0}, {0, 1}, {0, 1}},
{{StrideX, 1}},
ck::tensor_operation::element_wise::Normalize{});
if(p_op->IsSupportedArgument(argument_ptr.get()))
......
......@@ -6,7 +6,7 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/device_binary_elementwise.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
......@@ -16,28 +16,23 @@
using F16 = ck::half_t;
using F32 = float;
using ABDataType = F16;
using CDataType = F16;
using EltwiseComputeDataType = F32;
using ABDataType = F16;
using CDataType = F16;
using Add = ck::tensor_operation::element_wise::Add;
using DeviceElementwiseAddInstance =
ck::tensor_operation::device::DeviceBinaryElementwise<ABDataType,
ABDataType,
CDataType,
EltwiseComputeDataType,
Add,
2,
8,
8,
8,
8>;
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ABDataType, ABDataType>,
ck::Tuple<CDataType>,
Add,
2,
8,
ck::Sequence<8, 8>,
ck::Sequence<8>>;
template <typename HostTensorA,
typename HostTensorB,
typename HostTensorC,
typename ComputeDataType,
typename Functor,
int broadcastDim>
void host_broadcast2D(
......@@ -49,19 +44,19 @@ void host_broadcast2D(
{
for(int n = 0; n < N; ++n)
{
ComputeDataType Amn = ck::type_convert<ComputeDataType>(A(m, n));
ComputeDataType Cmn = 0;
auto Amn = A(m, n);
ctype Cmn = 0;
if constexpr(broadcastDim == 0)
{
ComputeDataType Bn = ck::type_convert<ComputeDataType>(B(n));
auto Bn = B(n);
functor(Cmn, Amn, Bn);
}
else
{
ComputeDataType Bm = ck::type_convert<ComputeDataType>(B(m));
auto Bm = B(m);
functor(Cmn, Amn, Bm);
}
C(m, n) = ck::type_convert<ctype>(Cmn);
C(m, n) = Cmn;
}
}
}
......@@ -103,18 +98,19 @@ int main()
b_n_device_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {c_m_n_device_buf.GetDeviceBuffer()};
std::vector<ck::index_t> a_strides = {Stride, 1};
std::vector<ck::index_t> b_strides = {0, 1};
std::vector<ck::index_t> c_strides = {Stride, 1};
std::array<ck::index_t, 2> abc_lengths = {M, N};
std::array<ck::index_t, 2> a_strides = {Stride, 1};
std::array<ck::index_t, 2> b_strides = {0, 1};
std::array<ck::index_t, 2> c_strides = {Stride, 1};
auto broadcastAdd = DeviceElementwiseAddInstance{};
auto argument = broadcastAdd.MakeArgumentPointer(
input, output, {M, N}, {a_strides, b_strides}, {c_strides}, Add{});
abc_lengths, {a_strides, b_strides}, {c_strides}, input, output, Add{});
if(!broadcastAdd.IsSupportedArgument(argument.get()))
{
throw std::runtime_error("The runtime parameters seems not supported by the "
"DeviceBinaryElementwise instance, exiting!");
throw std::runtime_error(
"The runtime parameters seems not supported by the device instance, exiting!");
};
auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer();
......@@ -129,12 +125,8 @@ int main()
c_m_n_device_buf.FromDevice(c_m_n.mData.data());
Tensor<CDataType> host_c_m_n(f_host_tensor_descriptor2d(M, N, Stride));
host_broadcast2D<Tensor<ABDataType>,
Tensor<ABDataType>,
Tensor<CDataType>,
EltwiseComputeDataType,
Add,
0>(host_c_m_n, a_m_n, b_n, M, N, Add{});
host_broadcast2D<Tensor<ABDataType>, Tensor<ABDataType>, Tensor<CDataType>, Add, 0>(
host_c_m_n, a_m_n, b_n, M, N, Add{});
pass &= ck::utils::check_err(
c_m_n.mData, host_c_m_n.mData, "Error: Incorrect results c", 1e-3, 1e-3);
......
......@@ -6,7 +6,7 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/device_binary_elementwise.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
......@@ -16,29 +16,21 @@
using F16 = ck::half_t;
using F32 = float;
using ABDataType = F16;
using CDataType = F16;
using EltwiseComputeDataType = F32;
using ABDataType = F16;
using CDataType = F16;
using Add = ck::tensor_operation::element_wise::Add;
using DeviceElementwiseAddInstance =
ck::tensor_operation::device::DeviceBinaryElementwise<ABDataType,
ABDataType,
CDataType,
EltwiseComputeDataType,
Add,
3,
8,
1,
8,
8>;
template <typename HostTensorA,
typename HostTensorB,
typename HostTensorC,
typename ComputeDataType,
typename Functor>
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ABDataType, ABDataType>,
ck::Tuple<CDataType>,
Add,
3,
8,
ck::Sequence<1, 8>,
ck::Sequence<8>>;
template <typename HostTensorA, typename HostTensorB, typename HostTensorC, typename Functor>
void host_broadcast3D_am_bmnk(HostTensorC& C,
const HostTensorA& A,
const HostTensorB& B,
......@@ -51,11 +43,11 @@ void host_broadcast3D_am_bmnk(HostTensorC& C,
for(std::size_t n = 0; n < shape[1]; ++n)
for(std::size_t k = 0; k < shape[2]; ++k)
{
ComputeDataType a_val = ck::type_convert<ComputeDataType>(A(m));
ComputeDataType b_val = ck::type_convert<ComputeDataType>(B(m, n, k));
ComputeDataType c_val = 0;
auto a_val = A(m);
auto b_val = B(m, n, k);
ctype c_val = 0;
functor(c_val, a_val, b_val);
C(m, n, k) = ck::type_convert<ctype>(c_val);
C(m, n, k) = c_val;
}
}
......@@ -85,25 +77,25 @@ int main()
b_m_n_k_device_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {c_m_n_k_device_buf.GetDeviceBuffer()};
std::vector<ck::index_t> a_strides = {1, 0, 0};
std::vector<ck::index_t> b_strides{b_m_n_k.mDesc.GetStrides().begin(),
b_m_n_k.mDesc.GetStrides().end()};
std::vector<ck::index_t> c_strides{c_m_n_k.mDesc.GetStrides().begin(),
c_m_n_k.mDesc.GetStrides().end()};
std::array<ck::index_t, 3> abc_lengths;
std::array<ck::index_t, 3> a_strides = {1, 0, 0};
std::array<ck::index_t, 3> b_strides;
std::array<ck::index_t, 3> c_strides;
std::copy(mnk.begin(), mnk.end(), abc_lengths.begin());
std::copy(
b_m_n_k.mDesc.GetStrides().begin(), b_m_n_k.mDesc.GetStrides().end(), b_strides.begin());
std::copy(
c_m_n_k.mDesc.GetStrides().begin(), c_m_n_k.mDesc.GetStrides().end(), c_strides.begin());
auto broadcastAdd = DeviceElementwiseAddInstance{};
auto argument =
broadcastAdd.MakeArgumentPointer(input,
output,
std::vector<ck::index_t>{mnk.begin(), mnk.end()},
{a_strides, b_strides},
{c_strides},
Add{});
auto argument = broadcastAdd.MakeArgumentPointer(
abc_lengths, {a_strides, b_strides}, {c_strides}, input, output, Add{});
if(!broadcastAdd.IsSupportedArgument(argument.get()))
{
throw std::runtime_error("The runtime parameters seems not supported by the "
"DeviceBinaryElementwise instance, exiting!");
throw std::runtime_error(
"The runtime parameters seems not supported by the device instance, exiting!");
};
auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer();
......@@ -118,11 +110,8 @@ int main()
c_m_n_k_device_buf.FromDevice(c_m_n_k.mData.data());
Tensor<CDataType> host_c_m_n_k(mnk);
host_broadcast3D_am_bmnk<Tensor<ABDataType>,
Tensor<ABDataType>,
Tensor<CDataType>,
EltwiseComputeDataType,
Add>(host_c_m_n_k, a_m, b_m_n_k, mnk, Add{});
host_broadcast3D_am_bmnk<Tensor<ABDataType>, Tensor<ABDataType>, Tensor<CDataType>, Add>(
host_c_m_n_k, a_m, b_m_n_k, mnk, Add{});
pass &= ck::utils::check_err(
c_m_n_k.mData, host_c_m_n_k.mData, "Error: Incorrect results c", 1e-3, 1e-3);
......
......@@ -5,7 +5,7 @@
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_binary_elementwise.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
......@@ -15,29 +15,21 @@
using F16 = ck::half_t;
using F32 = float;
using ABDataType = F16;
using CDataType = F16;
using EltwiseComputeDataType = F32;
using ABDataType = F16;
using CDataType = F16;
using Add = ck::tensor_operation::element_wise::Add;
using DeviceElementwiseAddInstance =
ck::tensor_operation::device::DeviceBinaryElementwise<ABDataType,
ABDataType,
CDataType,
EltwiseComputeDataType,
Add,
1,
8,
8,
8,
8>;
template <typename HostTensorA,
typename HostTensorB,
typename HostTensorC,
typename ComputeDataType,
typename Functor>
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ABDataType, ABDataType>,
ck::Tuple<CDataType>,
Add,
1,
8,
ck::Sequence<8, 8>,
ck::Sequence<8>>;
template <typename HostTensorA, typename HostTensorB, typename HostTensorC, typename Functor>
void host_elementwise1D(
HostTensorC& C, const HostTensorA& A, const HostTensorB& B, int M, Functor functor)
{
......@@ -45,11 +37,11 @@ void host_elementwise1D(
for(int m = 0; m < M; ++m)
{
ComputeDataType Am = ck::type_convert<ComputeDataType>(A(m));
ComputeDataType Bm = ck::type_convert<ComputeDataType>(B(m));
ComputeDataType Cm = 0;
auto Am = A(m);
auto Bm = B(m);
ctype Cm = 0;
functor(Cm, Am, Bm);
C(m) = ck::type_convert<ctype>(Cm);
C(m) = Cm;
}
}
......@@ -83,18 +75,19 @@ int main()
b_m_device_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {c_m_device_buf.GetDeviceBuffer()};
std::vector<ck::index_t> a_strides = {1};
std::vector<ck::index_t> b_strides = {1};
std::vector<ck::index_t> c_strides = {1};
std::array<ck::index_t, 1> abc_lengths = {M};
std::array<ck::index_t, 1> a_strides = {1};
std::array<ck::index_t, 1> b_strides = {1};
std::array<ck::index_t, 1> c_strides = {1};
auto broadcastAdd = DeviceElementwiseAddInstance{};
auto argument = broadcastAdd.MakeArgumentPointer(
input, output, {M}, {{a_strides}, b_strides}, {c_strides}, Add{});
abc_lengths, {a_strides, b_strides}, {c_strides}, input, output, Add{});
if(!broadcastAdd.IsSupportedArgument(argument.get()))
{
throw std::runtime_error("The runtime parameters seems not supported by the "
"DeviceBinaryElementwise instance, exiting!");
throw std::runtime_error(
"The runtime parameters seems not supported by the device instance, exiting!");
};
auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer();
......@@ -109,11 +102,8 @@ int main()
c_m_device_buf.FromDevice(c_m.mData.data());
Tensor<CDataType> host_c_m(f_host_tensor_descriptor1d(M, 1));
host_elementwise1D<Tensor<ABDataType>,
Tensor<ABDataType>,
Tensor<CDataType>,
EltwiseComputeDataType,
Add>(host_c_m, a_m, b_m, M, Add{});
host_elementwise1D<Tensor<ABDataType>, Tensor<ABDataType>, Tensor<CDataType>, Add>(
host_c_m, a_m, b_m, M, Add{});
pass &= ck::utils::check_err(
c_m.mData, host_c_m.mData, "Error: Incorrect results c", 1e-3, 1e-3);
......
......@@ -6,7 +6,7 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/device_binary_elementwise.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
......@@ -16,29 +16,21 @@
using F16 = ck::half_t;
using F32 = float;
using ABDataType = F16;
using CDataType = F16;
using EltwiseComputeDataType = F32;
using ABDataType = F16;
using CDataType = F16;
using Add = ck::tensor_operation::element_wise::Add;
using DeviceElementwiseAddInstance =
ck::tensor_operation::device::DeviceBinaryElementwise<ABDataType,
ABDataType,
CDataType,
EltwiseComputeDataType,
Add,
4,
8,
8,
8,
8>;
template <typename HostTensorA,
typename HostTensorB,
typename HostTensorC,
typename ComputeDataType,
typename Functor>
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ABDataType, ABDataType>,
ck::Tuple<CDataType>,
Add,
4,
8,
ck::Sequence<8, 8>,
ck::Sequence<8>>;
template <typename HostTensorA, typename HostTensorB, typename HostTensorC, typename Functor>
void host_elementwise4D(HostTensorC& C,
const HostTensorA& A,
const HostTensorB& B,
......@@ -52,11 +44,11 @@ void host_elementwise4D(HostTensorC& C,
for(std::size_t h = 0; h < shape[2]; ++h)
for(std::size_t w = 0; w < shape[3]; ++w)
{
ComputeDataType a_val = ck::type_convert<ComputeDataType>(A(n, c, h, w));
ComputeDataType b_val = ck::type_convert<ComputeDataType>(B(n, c, h, w));
ComputeDataType c_val = 0;
auto a_val = A(n, c, h, w);
auto b_val = B(n, c, h, w);
ctype c_val = 0;
functor(c_val, a_val, b_val);
C(n, c, h, w) = ck::type_convert<ctype>(c_val);
C(n, c, h, w) = c_val;
}
}
......@@ -85,23 +77,24 @@ int main()
b_device_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {c_device_buf.GetDeviceBuffer()};
std::vector<ck::index_t> a_strides{a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end()};
std::vector<ck::index_t> b_strides{b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end()};
std::vector<ck::index_t> c_strides{c.mDesc.GetStrides().begin(), c.mDesc.GetStrides().end()};
std::array<ck::index_t, 4> abc_lengths;
std::array<ck::index_t, 4> a_strides;
std::array<ck::index_t, 4> b_strides;
std::array<ck::index_t, 4> c_strides;
std::copy(nchw.begin(), nchw.end(), abc_lengths.begin());
std::copy(a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end(), a_strides.begin());
std::copy(b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end(), b_strides.begin());
std::copy(c.mDesc.GetStrides().begin(), c.mDesc.GetStrides().end(), c_strides.begin());
auto broadcastAdd = DeviceElementwiseAddInstance{};
auto argument =
broadcastAdd.MakeArgumentPointer(input,
output,
std::vector<ck::index_t>{nchw.begin(), nchw.end()},
{{a_strides}, b_strides},
{c_strides},
Add{});
auto argument = broadcastAdd.MakeArgumentPointer(
abc_lengths, {a_strides, b_strides}, {c_strides}, input, output, Add{});
if(!broadcastAdd.IsSupportedArgument(argument.get()))
{
throw std::runtime_error("The runtime parameters seems not supported by the "
"DeviceBinaryElementwise instance, exiting!");
throw std::runtime_error(
"The runtime parameters seems not supported by the device instance, exiting!");
};
auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer();
......@@ -116,11 +109,8 @@ int main()
c_device_buf.FromDevice(c.mData.data());
Tensor<CDataType> host_c(nchw);
host_elementwise4D<Tensor<ABDataType>,
Tensor<ABDataType>,
Tensor<CDataType>,
EltwiseComputeDataType,
Add>(host_c, a, b, nchw, Add{});
host_elementwise4D<Tensor<ABDataType>, Tensor<ABDataType>, Tensor<CDataType>, Add>(
host_c, a, b, nchw, Add{});
pass &=
ck::utils::check_err(c.mData, host_c.mData, "Error: Incorrect results c", 1e-3, 1e-3);
......
......@@ -10,7 +10,7 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"
......@@ -94,23 +94,18 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataTyp
using NormalizeFunctor = ck::tensor_operation::element_wise::Normalize;
// A:x, B:E[x], C:E[x^2], D:Gamma, E:Beta , F:y
using DeviceNormalizeInstance =
ck::tensor_operation::device::Device5AryElementwise<EDataType,
R0DataType,
R1DataType,
GammaDataType,
BetaDataType,
LayerNormOutDataType,
NormalizeComputeDataType,
NormalizeFunctor,
2,
8,
8, // scalarPerVector: gemm_out
1, // scalarPerVector: reduce_mean
1, // scalarPerVector: reduce_mean_square
8, // scalarPerVector: Gamma
8, // scalarPerVector: Beta
8>; // scalarPerVector: LayerNorm_out
using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise<
ck::Tuple<EDataType,
R0DataType,
R1DataType,
GammaDataType,
BetaDataType>, // x(gemm_out), mean, meansquare, gamma, beta
ck::Tuple<LayerNormOutDataType>, // y
NormalizeFunctor,
2,
8, // MPerthread
ck::Sequence<8, 1, 1, 8, 8>, // scalarPerVector: x(gemm_out), mean, meansquare, gamma, beta
ck::Sequence<8>>; // scalarPerVector: y(layerNorm_out)
auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
return HostTensorDescriptor(std::vector<std::size_t>({len}),
......@@ -197,14 +192,9 @@ void host_gemm_layernorm(Tensor<LayerNormOutDataType>& out_m_n,
{
for(int n = 0; n < N; ++n)
{
NormalizeComputeDataType out_acc = 0;
layerNormInst(out_acc,
ck::type_convert<NormalizeComputeDataType>(e_m_n(m, n)),
ck::type_convert<NormalizeComputeDataType>(mean_m(m)),
ck::type_convert<NormalizeComputeDataType>(meanSquare_m(m)),
ck::type_convert<NormalizeComputeDataType>(gamma_n(n)),
ck::type_convert<NormalizeComputeDataType>(beta_n(n)));
out_m_n(m, n) = ck::type_convert<LayerNormOutDataType>(out_acc);
LayerNormOutDataType out_val = 0;
layerNormInst(out_val, e_m_n(m, n), mean_m(m), meanSquare_m(m), gamma_n(n), beta_n(n));
out_m_n(m, n) = out_val;
}
}
}
......@@ -339,28 +329,28 @@ int main()
beta_device_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {layerNorm_device_buf.GetDeviceBuffer()};
auto normalize = DeviceNormalizeInstance{};
auto normalize_invoker = normalize.MakeInvoker();
auto normalize_argument = normalize.MakeArgument(input,
output,
{M, N},
{StrideE, 1},
{1, 0},
{1, 0},
{0, 1},
{0, 1},
{StrideE, 1},
NormalizeFunctor{});
if(!normalize.IsSupportedArgument(normalize_argument))
std::array<ck::index_t, 2> xyLengths = {M, N};
std::array<ck::index_t, 2> xyStrides = {StrideE, 1};
auto normalize = DeviceNormalizeInstance{};
auto normalize_invoker = normalize.MakeInvoker();
auto normalize_argument_ptr =
normalize.MakeArgumentPointer(xyLengths,
{xyStrides, {1, 0}, {1, 0}, {0, 1}, {0, 1}},
{xyStrides},
input,
output,
NormalizeFunctor{});
if(!normalize.IsSupportedArgument(normalize_argument_ptr.get()))
{
throw std::runtime_error("The runtime parameters seems not supported by the "
"Device5AryElementwise instance, exiting!");
throw std::runtime_error(
"The runtime parameters seems not supported by the device, exiting!");
}
// run kernel
gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, false});
normalize_invoker.Run(normalize_argument, StreamConfig{nullptr, false});
normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, false});
bool pass = true;
{
......@@ -396,7 +386,7 @@ int main()
float gemm_reduce_mean_reduce_square_mean_ave_time =
gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, time_kernel});
float normalize_ave_time =
normalize_invoker.Run(normalize_argument, StreamConfig{nullptr, time_kernel});
normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
if(time_kernel)
DumpGemmLayerNormPerf<ADataType,
......
......@@ -10,7 +10,7 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"
......@@ -91,23 +91,20 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataTyp
using NormalizeFunctor = ck::tensor_operation::element_wise::Normalize;
// A:x, B:E[x], C:E[x^2], D:Gamma, E:Beta , F:y
using DeviceNormalizeInstance =
ck::tensor_operation::device::Device5AryElementwise<EDataType,
R0DataType,
R1DataType,
GammaDataType,
BetaDataType,
LayerNormOutDataType,
NormalizeComputeDataType,
NormalizeFunctor,
2,
8,
8, // scalarPerVector: gemm_out
1, // scalarPerVector: reduce_mean
1, // scalarPerVector: reduce_mean_square
8, // scalarPerVector: Gamma
8, // scalarPerVector: Beta
8>; // scalarPerVector: LayerNorm_out
using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise<
ck::Tuple<EDataType,
R0DataType,
R1DataType,
GammaDataType,
BetaDataType>, // x(gemm_out), mean,
// meansquare,
// gamma, beta
ck::Tuple<LayerNormOutDataType>, // y
NormalizeFunctor,
2,
8, // MPerthread
ck::Sequence<8, 1, 1, 8, 8>, // scalarPerVector: x(gemm_out), mean, meansquare, gamma, beta
ck::Sequence<8>>; // scalarPerVector: y(layerNorm_out)
auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
return HostTensorDescriptor(std::vector<std::size_t>({len}),
......@@ -139,7 +136,6 @@ void host_gemm_layernorm(Tensor<LayerNormOutDataType>& out_m_n,
int M,
int N)
{
int StrideE = N;
Tensor<EDataType> e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
Tensor<R0DataType> mean_m(f_host_tensor_descriptor1d(M, 1));
......@@ -184,14 +180,9 @@ void host_gemm_layernorm(Tensor<LayerNormOutDataType>& out_m_n,
{
for(int n = 0; n < N; ++n)
{
NormalizeComputeDataType out_acc = 0;
layerNormInst(out_acc,
ck::type_convert<NormalizeComputeDataType>(e_m_n(m, n)),
ck::type_convert<NormalizeComputeDataType>(mean_m(m)),
ck::type_convert<NormalizeComputeDataType>(meanSquare_m(m)),
ck::type_convert<NormalizeComputeDataType>(gamma_n(n)),
ck::type_convert<NormalizeComputeDataType>(beta_n(n)));
out_m_n(m, n) = ck::type_convert<LayerNormOutDataType>(out_acc);
LayerNormOutDataType out_val = 0;
layerNormInst(out_val, e_m_n(m, n), mean_m(m), meanSquare_m(m), gamma_n(n), beta_n(n));
out_m_n(m, n) = out_val;
}
}
}
......@@ -314,28 +305,28 @@ int main()
beta_device_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {layerNorm_device_buf.GetDeviceBuffer()};
auto normalize = DeviceNormalizeInstance{};
auto normalize_invoker = normalize.MakeInvoker();
auto normalize_argument = normalize.MakeArgument(input,
output,
{M, N},
{StrideE, 1},
{1, 0},
{1, 0},
{0, 1},
{0, 1},
{StrideE, 1},
NormalizeFunctor{});
if(!normalize.IsSupportedArgument(normalize_argument))
std::array<ck::index_t, 2> xyLengths = {M, N};
std::array<ck::index_t, 2> xyStrides = {StrideE, 1};
auto normalize = DeviceNormalizeInstance{};
auto normalize_invoker = normalize.MakeInvoker();
auto normalize_argument_ptr =
normalize.MakeArgumentPointer(xyLengths,
{xyStrides, {1, 0}, {1, 0}, {0, 1}, {0, 1}},
{xyStrides},
input,
output,
NormalizeFunctor{});
if(!normalize.IsSupportedArgument(normalize_argument_ptr.get()))
{
throw std::runtime_error("The runtime parameters seems not supported by the "
"Device5AryElementwise instance, exiting!");
throw std::runtime_error(
"The runtime parameters seems not supported by the device, exiting");
}
// run kernel
gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, false});
normalize_invoker.Run(normalize_argument, StreamConfig{nullptr, false});
normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, false});
bool pass = true;
{
......@@ -369,7 +360,7 @@ int main()
float gemm_reduce_mean_reduce_square_mean_ave_time =
gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, time_kernel});
float normalize_ave_time =
normalize_invoker.Run(normalize_argument, StreamConfig{nullptr, time_kernel});
normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
if(time_kernel)
DumpGemmLayerNormPerf<ADataType,
......
add_example_executable(example_dual_reduce_multiblock dual_reduce_multiblock.cpp)
add_example_executable(example_dual_reduce_threadwise dual_reduce_threadwise.cpp)
# Instructions for ```example_dual_reduce```
## Run ```example_dual_reduce_multiblock```
```bash
# -D <xxx> : input 4-d tensor lengths
# -v <x> : verification (0=no, 1=yes)
#arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
#arg2: time kernel (0=no, 1=yes)
./bin/example_dual_reduce_multiblock -D 600,28,28,256 -v 1 2 1
```
Result
```
./bin/example_dual_reduce_multiblock -D 600,28,28,256 -v 1 2 1
launch_and_time_kernel: grid_dim {150, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 1.19529 ms, 201.499 GB/s, DeviceMultipleReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_1_InSrcVectorSize_1,OutDstVectorSize_1_1>
```
## Run ```example_dual_reduce_threadwise```
```bash
# -D <xxx> : input 4-d tensor lengths
# -v <x> : verification (0=no, 1=yes)
#arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
#arg2: time kernel (0=no, 1=yes)
./bin/example_dual_reduce_threadwise -D 8000,4,4,4 -v 1 2 1
```
Result
```
./bin/example_dual_reduce_threadwise -D 8000,4,4,4 -v 1 2 1
launch_and_time_kernel: grid_dim {32, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 0.01512 ms, 71.9577 GB/s, DeviceMultipleReduceThreadwise<256,M_C256_S1,K_C1_S4,InSrcVectorDim_1_InSrcVectorSize_2,OutDstVectorSize_1_1>
```
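Both executables produce the two outputs in a single pass over the NHWC input, matching the host reference `mean_meansquare_host` in `dual_reduce_common.hpp` below; per batch index n:

```latex
\mathrm{mean}(n)       = \frac{1}{H W C}\sum_{h,w,c} x(n,h,w,c), \qquad
\mathrm{meansquare}(n) = \frac{1}{H W C}\sum_{h,w,c} x(n,h,w,c)^{2}
```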
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <cstdlib>
#include <vector>
#include <array>
#include <algorithm>
#include <getopt.h>
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp"
static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
{"verify", required_argument, nullptr, 'v'},
{"help", no_argument, nullptr, '?'},
{nullptr, 0, nullptr, 0}};
class SimpleAppArgs
{
private:
int option_index = 0;
public:
std::vector<size_t> inLengths = {600, 28, 28, 256};
size_t n, h, w, c;
bool do_verification = true;
int init_method = 2;
bool time_kernel = true;
public:
SimpleAppArgs()
{
n = inLengths[0];
h = inLengths[1];
w = inLengths[2];
c = inLengths[3];
};
void show_usage(const char* cmd)
{
std::cout << "Usage of " << cmd << std::endl;
std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths"
<< std::endl;
std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by "
"comparing with the host-based reduction"
<< std::endl;
std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer "
"value, 3=decimal value)"
<< std::endl;
std::cout << "Arg2 -- time kernel (0=no, 1=yes)" << std::endl;
};
int processArgs(int argc, char* argv[])
{
using ck::host_common::getTypeValuesFromString;
int ch;
while(1)
{
ch = getopt_long(argc, argv, "D:v:l:", long_options, &option_index);
if(ch == -1)
break;
switch(ch)
{
case 'D':
if(!optarg)
throw std::runtime_error("Invalid option format!");
inLengths = getTypeValuesFromString<size_t>(optarg);
if(inLengths.size() != 4)
throw std::runtime_error(
"Invalid option format! The number of integers is incorrect!");
break;
case 'v':
if(!optarg)
throw std::runtime_error("Invalid option format!");
do_verification = static_cast<bool>(std::atoi(optarg));
break;
case '?':
if(std::string(long_options[option_index].name) == "help")
{
show_usage(argv[0]);
return (-1);
};
break;
default: show_usage(argv[0]); return (-1);
};
};
if(optind + 2 > argc)
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
init_method = std::atoi(argv[optind++]);
time_kernel = static_cast<bool>(std::atoi(argv[optind]));
n = inLengths[0];
h = inLengths[1];
w = inLengths[2];
c = inLengths[3];
return (0);
};
};
template <typename InDataType, typename OutDataType1, typename OutDataType2, typename AccDataType>
static void mean_meansquare_host(const Tensor<InDataType>& in,
Tensor<OutDataType1>& mean_ref,
Tensor<OutDataType2>& meansquare_ref,
size_t n,
size_t h,
size_t w,
size_t c)
{
auto thread_reduce_func = [&](auto iN) {
AccDataType mean = ck::type_convert<AccDataType>(0.0f);
AccDataType meansquare = ck::type_convert<AccDataType>(0.0f);
// compute mean and meansquare
for(std::size_t iH = 0; iH < h; iH++)
{
for(std::size_t iW = 0; iW < w; iW++)
{
for(std::size_t iC = 0; iC < c; iC++)
{
AccDataType curr_value = ck::type_convert<AccDataType>(in(iN, iH, iW, iC));
mean += curr_value;
meansquare += curr_value * curr_value;
};
}
};
mean = mean / (h * w * c);
meansquare = meansquare / (h * w * c);
mean_ref(iN) = ck::type_convert<OutDataType1>(mean);
meansquare_ref(iN) = ck::type_convert<OutDataType2>(meansquare);
};
std::size_t num_thread = std::thread::hardware_concurrency();
std::size_t work_per_thread = (n + num_thread - 1) / num_thread;
std::vector<joinable_thread> threads(num_thread);
for(std::size_t it = 0; it < num_thread; it++)
{
std::size_t iN_begin = it * work_per_thread;
std::size_t iN_end = std::min(static_cast<size_t>((it + 1) * work_per_thread), n);
auto f = [=] {
for(std::size_t iN = iN_begin; iN < iN_end; iN++)
{
thread_reduce_func(iN);
}
};
threads[it] = joinable_thread(f);
}
};
using ReduceOperation = ck::reduce::Add;
using InElementwiseOperation_Mean = ck::tensor_operation::element_wise::PassThrough;
using AccElementwiseOperation_Mean = ck::tensor_operation::element_wise::UnaryDivide;
using InElementwiseOperation_Meansquare = ck::tensor_operation::element_wise::UnarySquare;
using AccElementwiseOperation_Meansquare = ck::tensor_operation::element_wise::UnaryDivide;
using InElementwiseOperationTuple =
ck::Tuple<InElementwiseOperation_Mean, InElementwiseOperation_Meansquare>;
using AccElementwiseOperationTuple =
ck::Tuple<AccElementwiseOperation_Mean, AccElementwiseOperation_Meansquare>;
template <typename DeviceDualReduce,
typename InDataType,
typename OutDataType,
typename AccDataType,
int Rank,
int NumReduceDim>
int mean_meansquare_dual_reduce_test(size_t n,
size_t h,
size_t w,
size_t c,
bool do_verification,
int init_method,
bool time_kernel,
const std::array<int, NumReduceDim> reduceDims)
{
const std::vector<size_t> inLengths = {n, h, w, c};
Tensor<InDataType> in(inLengths);
std::vector<size_t> outLengths{n};
Tensor<OutDataType> mean_ref(outLengths);
Tensor<OutDataType> mean(outLengths);
Tensor<OutDataType> meansquare_ref(outLengths);
Tensor<OutDataType> meansquare(outLengths);
auto inStrides = in.mDesc.GetStrides();
auto outStrides = mean.mDesc.GetStrides();
size_t invariant_total_length = n;
size_t reduce_total_length = h * w * c;
const AccDataType alpha = ck::type_convert<AccDataType>(1.0f);
const AccDataType beta = ck::type_convert<AccDataType>(0.0f);
std::size_t num_thread = 1;
if(do_verification)
{
switch(init_method)
{
case 0: break;
case 1: in.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread); break;
case 2: in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread); break;
default: in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
}
};
// these buffers are usually provided by the user application
DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
DeviceMem mean_dev(sizeof(OutDataType) * mean.mDesc.GetElementSpaceSize());
DeviceMem meansquare_dev(sizeof(OutDataType) * meansquare.mDesc.GetElementSpaceSize());
in_dev.ToDevice(in.mData.data());
if(do_verification)
{
mean_meansquare_host<InDataType, OutDataType, OutDataType, AccDataType>(
in, mean_ref, meansquare_ref, n, h, w, c);
};
constexpr ck::index_t NumInputDim = Rank;
constexpr ck::index_t NumOutputDim = (Rank - NumReduceDim > 1) ? Rank - NumReduceDim : 1;
std::array<ck::index_t, NumInputDim> i_inLengths;
std::array<ck::index_t, NumInputDim> i_inStrides;
std::array<ck::index_t, NumOutputDim> i_outLengths;
std::array<ck::index_t, NumOutputDim> i_outStrides;
std::copy(inLengths.begin(), inLengths.end(), i_inLengths.begin());
std::copy(inStrides.begin(), inStrides.end(), i_inStrides.begin());
std::copy(outLengths.begin(), outLengths.end(), i_outLengths.begin());
std::copy(outStrides.begin(), outStrides.end(), i_outStrides.begin());
auto dual_reduce_op = DeviceDualReduce{};
auto argument_ptr = dual_reduce_op.MakeArgumentPointer(
i_inLengths,
i_inStrides,
i_outLengths,
{i_outStrides, i_outStrides},
reduceDims,
{&alpha, &alpha},
{&beta, &beta},
in_dev.GetDeviceBuffer(),
{mean_dev.GetDeviceBuffer(), meansquare_dev.GetDeviceBuffer()},
ck::make_tuple(InElementwiseOperation_Mean{}, InElementwiseOperation_Meansquare{}),
ck::make_tuple(
AccElementwiseOperation_Mean{static_cast<int32_t>(reduce_total_length)},
AccElementwiseOperation_Meansquare{static_cast<int32_t>(reduce_total_length)}));
if(!dual_reduce_op.IsSupportedArgument(argument_ptr.get()))
{
std::cout
<< "The runtime parameters seems not supported by the DeviceReduce instance, exiting!"
<< std::endl;
return (-1);
};
std::string reduce_name = dual_reduce_op.GetTypeString();
auto invoker_ptr = dual_reduce_op.MakeInvokerPointer();
float avg_time = 0.0f;
avg_time += invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InDataType) +
2 * invariant_total_length * sizeof(OutDataType);
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name
<< std::endl;
bool pass = true;
if(do_verification)
{
mean_dev.FromDevice(mean.mData.data());
meansquare_dev.FromDevice(meansquare.mData.data());
pass = pass && ck::utils::check_err(mean.mData, mean_ref.mData);
pass = pass && ck::utils::check_err(meansquare.mData, meansquare_ref.mData);
};
return (pass ? 0 : 1);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <cstdlib>
#include <vector>
#include <array>
#include <algorithm>
#include <getopt.h>
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/device_multiple_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "dual_reduce_common.hpp"
using namespace ck;
using namespace ck::tensor_operation::device;
using InDataType = ck::half_t;
using OutDataType = float;
using OutDataTypeTuple = Tuple<OutDataType, OutDataType>;
using AccDataType = float;
// for NHWC layer-norm calculation of mean and meansquare
constexpr int Rank = 4;
constexpr int NumReduceDim = 3;
constexpr bool PropagateNan = false;
constexpr InMemoryDataOperationEnum OutMemoryDataOperation = InMemoryDataOperationEnum::Set;
using DeviceDualReduce = DeviceMultipleReduceMultiBlock<2,
InDataType,
AccDataType,
OutDataTypeTuple,
Rank,
NumReduceDim,
ReduceOperation,
InElementwiseOperationTuple,
AccElementwiseOperationTuple,
OutMemoryDataOperation,
PropagateNan,
256,
4,
64,
1,
1,
1, // InSrcVectorDim
1,
ck::Sequence<1, 1>>;
int main(int argc, char* argv[])
{
int retval = 0;
if(argc > 1)
{
SimpleAppArgs arg;
if(arg.processArgs(argc, argv) < 0)
return (-1);
std::array<int, NumReduceDim> reduceDims = {1, 2, 3};
retval = mean_meansquare_dual_reduce_test<DeviceDualReduce,
InDataType,
OutDataType,
AccDataType,
Rank,
NumReduceDim>(arg.n,
arg.h,
arg.w,
arg.c,
arg.do_verification,
arg.init_method,
arg.time_kernel,
reduceDims);
}
else
{
std::array<int, NumReduceDim> reduceDims = {1, 2, 3};
retval = mean_meansquare_dual_reduce_test<DeviceDualReduce,
InDataType,
OutDataType,
AccDataType,
Rank,
NumReduceDim>(
600, 28, 28, 256, true, 2, true, reduceDims);
};
return (retval);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <cstdlib>
#include <vector>
#include <array>
#include <algorithm>
#include <getopt.h>
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/device_multiple_reduce_threadwise.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "dual_reduce_common.hpp"
using namespace ck;
using namespace ck::tensor_operation::device;
using InDataType = ck::half_t;
using OutDataType = float;
using OutDataTypeTuple = Tuple<OutDataType, OutDataType>;
using AccDataType = float;
// for NHWC layer-norm calculation of mean and meansquare
constexpr int Rank = 4;
constexpr int NumReduceDim = 3;
constexpr bool PropagateNan = false;
using DeviceDualReduce = DeviceMultipleReduceThreadWise<2,
InDataType,
AccDataType,
OutDataTypeTuple,
Rank,
NumReduceDim,
ReduceOperation,
InElementwiseOperationTuple,
AccElementwiseOperationTuple,
PropagateNan,
256,
1,
4,
1, // InSrcVectorDim
2,
ck::Sequence<1, 1>>;
int main(int argc, char* argv[])
{
int retval = 0;
if(argc > 1)
{
SimpleAppArgs arg;
if(arg.processArgs(argc, argv) < 0)
return (-1);
std::array<int, NumReduceDim> reduceDims = {1, 2, 3};
retval = mean_meansquare_dual_reduce_test<DeviceDualReduce,
InDataType,
OutDataType,
AccDataType,
Rank,
NumReduceDim>(arg.n,
arg.h,
arg.w,
arg.c,
arg.do_verification,
arg.init_method,
arg.time_kernel,
reduceDims);
}
else
{
std::array<int, NumReduceDim> reduceDims = {1, 2, 3};
retval = mean_meansquare_dual_reduce_test<DeviceDualReduce,
InDataType,
OutDataType,
AccDataType,
Rank,
NumReduceDim>(
8000, 4, 4, 4, true, 2, true, reduceDims);
};
return (retval);
}
add_example_executable(example_batchnorm_forward batchnorm_forward_nhwc.cpp)
add_example_executable(example_batchnorm_infer batchnorm_infer_nhwc.cpp)
# Instructions for ```batchnorm nhwc``` Example
## Run ```batchnorm forward nhwc```
```bash
# -D <xxx> : input 4-d tensor lengths
# -v <x> : verification (0=no, 1=yes)
#arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)
#arg2: 1/0 to indicate whether to update the moving average and variance (0=no, 1=yes)
#arg3: 1/0 to indicate whether to save result mean/invVariance (0=no, 1=yes)
#arg4: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
#arg5: time kernel (0=no, 1=yes)
./bin/example_batchnorm_forward -D 128,16,16,1024 -v 1 0 0 1 2 1
```
Result
```
./bin/example_batchnorm_forward -D 128,16,16,1024 -v 1 0 0 1 2 1
launch_and_time_kernel: grid_dim {64, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
launch_and_time_kernel: grid_dim {120, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
launch_and_time_kernel: grid_dim {120, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 2.08231 ms, 354.519 GB/s
```
Result
```
./bin/example_batchnorm_forward -D 128,16,16,1024 -v 1 0 1 0 2 0
echo $?
0
```
## Run ```batchnorm infer nhwc```
```bash
# -D <xxx> : input 4-d tensor lengths
# -v <x> : verification (0=no, 1=yes)
#arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)
#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
#arg3: time kernel (0=no, 1=yes)
./bin/example_batchnorm_infer -D 128,16,16,1024 -v 1 0 2 1
```
Result
```
./bin/example_batchnorm_infer -D 128,16,16,1024 -v 1 0 2 1
launch_and_time_kernel: grid_dim {120, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 1.28235 ms, 523.329 GB/s
```
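For orientation, these are the quantities the batchnorm-forward example assembles from generic kernels, per channel c of the NHWC input, following the element-wise operators defined in `batchnorm_common.hpp` below (ε is the epsilon argument, α the exponential average factor):

```latex
\begin{aligned}
\mu(c)      &= \tfrac{1}{NHW}\sum_{n,h,w} x(n,h,w,c) \\
\sigma^2(c) &= \tfrac{1}{NHW}\sum_{n,h,w} x(n,h,w,c)^2 - \mu(c)^2 \\
y(n,h,w,c)  &= \mathrm{bnScale}(c)\,\frac{x(n,h,w,c)-\mu(c)}{\sqrt{\sigma^2(c)+\varepsilon}} + \mathrm{bnBias}(c) \\
\mathrm{runningMean}(c)     &\leftarrow (1-\alpha)\,\mathrm{runningMean}(c) + \alpha\,\mu(c) \\
\mathrm{runningVariance}(c) &\leftarrow (1-\alpha)\,\mathrm{runningVariance}(c) + \alpha\,\sigma^2(c) \\
\mathrm{saveInvVariance}(c) &= 1/\sqrt{\sigma^2(c)+\varepsilon}
\end{aligned}
```

The first two lines come from the dual-reduction kernel, the third from the generic elementwise normalize kernel, and the remaining lines from the optional moving-average / inverse-variance elementwise kernels. The inference path evaluates only the third line, using the stored running mean and variance.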
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cassert>
#include <vector>
#include <array>
#include <type_traits>
#include "ck/utility/data_type.hpp"
// binary operation used to calculate invVariance from mean and meansquare
struct InvVariance
{
InvVariance(double epsilon) : epsilon_(epsilon){};
template <typename T>
__host__ __device__ constexpr void operator()(T& y, const T& mean, const T& meansquare) const
{
static_assert(std::is_same<T, float>::value || std::is_same<T, double>::value,
"Data type is not supported by this operation!");
using ck::type_convert;
using ck::math::sqrt;
T tmp_epsilon = type_convert<T>(epsilon_);
y = meansquare - mean * mean;
y = 1.0f / sqrt(tmp_epsilon + y);
};
double epsilon_;
};
// (4-in, 2-out) element-wise operation used to update the moving average of mean and variance
struct MovingAverage
{
MovingAverage(double factor) : factor_(factor){};
template <typename T>
__host__ __device__ constexpr void operator()(T& y0,
T& y1,
const T& mean,
const T& runningMean,
const T& meansquare,
const T& runningVariance) const
{
static_assert(std::is_same<T, float>::value || std::is_same<T, double>::value,
"Data type is not supported by this operation!");
using ck::type_convert;
T tmp_factor = type_convert<T>(factor_);
T variance = meansquare - mean * mean;
y0 = runningMean * (type_convert<T>(1.0f) - tmp_factor) + mean * tmp_factor;
y1 = runningVariance * (type_convert<T>(1.0f) - tmp_factor) + variance * tmp_factor;
};
double factor_;
};
struct MovingAverageAndInvVariance
{
MovingAverageAndInvVariance(double epsilon, double factor)
: epsilon_(epsilon), factor_(factor){};
template <typename T>
__host__ __device__ constexpr void operator()(T& y0, // resultRunningMean
T& y1, // resultRunningVariance
T& y2, // saveInvVariance
const T& mean,
const T& runningMean,
const T& meansquare,
const T& runningVariance) const
{
static_assert(std::is_same<T, float>::value || std::is_same<T, double>::value,
"Data type is not supported by this operation!");
using ck::type_convert;
using ck::math::sqrt;
T tmp_epsilon = type_convert<T>(epsilon_);
T tmp_factor = type_convert<T>(factor_);
T variance = meansquare - mean * mean;
y0 = runningMean * (type_convert<T>(1.0f) - tmp_factor) + mean * tmp_factor;
y1 = runningVariance * (type_convert<T>(1.0f) - tmp_factor) + variance * tmp_factor;
y2 = 1.0f / sqrt(tmp_epsilon + variance);
};
double epsilon_;
double factor_;
};
struct NormalizeInInfer
{
NormalizeInInfer(double epsilon = 1e-4) : epsilon_(epsilon) {}
template <typename T1, typename T2>
__host__ __device__ constexpr void operator()(T1& y,
const T1& x,
const T2& mean,
const T2& variance,
const T2& gamma,
const T2& beta) const
{
static_assert(std::is_same<T2, float>::value || std::is_same<T2, double>::value,
"Data type is not supported by this operation!");
using ck::type_convert;
using ck::math::sqrt;
T2 tmp_x, tmp_y;
tmp_x = type_convert<T2>(x);
tmp_y = ((tmp_x - mean) / sqrt(variance + type_convert<T2>(epsilon_))) * gamma + beta;
y = type_convert<T1>(tmp_y);
};
double epsilon_;
};
struct NormalizeInForward
{
NormalizeInForward(double epsilon = 1e-4) : epsilon_(epsilon) {}
template <typename T1, typename T2>
__host__ __device__ constexpr void operator()(T1& y,
const T1& x,
const T2& mean,
const T2& meansquare,
const T2& gamma,
const T2& beta) const
{
static_assert(std::is_same<T2, float>::value || std::is_same<T2, double>::value,
"Data type is not supported by this operation!");
using ck::type_convert;
using ck::math::sqrt;
T2 tmp_x, tmp_y;
T2 variance = meansquare - mean * mean;
tmp_x = type_convert<T2>(x);
tmp_y = ((tmp_x - mean) / sqrt(variance + type_convert<T2>(epsilon_))) * gamma + beta;
y = type_convert<T1>(tmp_y);
};
double epsilon_;
};
template <int Rank, int NumReduceDim>
static inline std::array<int, Rank - NumReduceDim>
get_invariant_dims(const std::array<int, NumReduceDim>& reduceDims)
{
int reduceFlag = 0;
// flag the bits for the reduceDims
for(int i = 0; i < NumReduceDim; i++)
{
reduceFlag |= 1 << reduceDims[i];
};
std::array<int, Rank - NumReduceDim> invariantDims;
// collect invariant dimensions
int dim = 0;
for(int i = 0; i < Rank; i++)
if((reduceFlag & (1 << i)) == 0)
{
invariantDims[dim] = i;
dim++;
};
return invariantDims;
};
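// Illustration (not part of the original header): for the NHWC batchnorm driven by
// batchnorm_forward_impl.hpp below, Rank = 4 and the three reduced dimensions are
// N, H and W, so get_invariant_dims<4, 3>({0, 1, 2}) yields {3}. Only the channel
// dimension C is invariant, which is why the scale/bias/mean/variance tensors are
// 1-d of length C.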
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cassert>
#include <vector>
#include "ck/ck.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/device_multiple_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "batchnorm_common.hpp"
template <typename InOutDataType,
typename AccDataType,
ck::index_t Rank,
ck::index_t NumBatchNormReduceDim,
bool fastest_dim_is_reduced = false>
int bnorm_fwd(bool time_kernel,
bool updateMovingAverage,
bool saveMeanAndInvVariance,
const std::array<int, NumBatchNormReduceDim> reduceDims,
const std::array<ck::index_t, Rank> xyLengths,
const std::array<ck::index_t, Rank> xStrides,
const std::array<ck::index_t, Rank> yStrides,
const std::array<ck::index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarLengths,
const std::array<ck::index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarStrides,
const void* p_x,
const void* p_scale,
const void* p_bias,
void* p_y,
double exponentialAverageFactor,
void* p_runningMean,
void* p_runningVariance,
double epsilon,
void* p_saveMean,
void* p_saveInvVariance,
void* p_tmp_mean,
void* p_tmp_meansquare)
{
static_assert(NumBatchNormReduceDim < Rank,
"Invalid number of reduced dimensions for batchnorm!");
constexpr ck::index_t NumScaleBiasMeanVarDim = Rank - NumBatchNormReduceDim;
using InElementwiseOperation_Mean = ck::tensor_operation::element_wise::PassThrough;
using AccElementwiseOperation_Mean = ck::tensor_operation::element_wise::UnaryDivide;
using InElementwiseOperation_Meansquare = ck::tensor_operation::element_wise::UnarySquare;
using AccElementwiseOperation_Meansquare = ck::tensor_operation::element_wise::UnaryDivide;
using DeviceMeanAndMeansquareInstance =
ck::tensor_operation::device::DeviceMultipleReduceMultiBlock<
2,
InOutDataType,
AccDataType,
ck::Tuple<AccDataType, AccDataType>,
Rank,
NumBatchNormReduceDim,
ck::reduce::Add,
ck::Tuple<InElementwiseOperation_Mean, InElementwiseOperation_Meansquare>,
ck::Tuple<AccElementwiseOperation_Mean, AccElementwiseOperation_Meansquare>,
ck::InMemoryDataOperationEnum::Set,
false, // PropagateNan
256,
16,
16,
1,
1,
fastest_dim_is_reduced ? 1 : 0,
1,
ck::Sequence<1, 1>>;
using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise<
ck::Tuple<InOutDataType, AccDataType, AccDataType, AccDataType, AccDataType>, // x, mean,
// meansquare,
// scale, bias
ck::Tuple<InOutDataType>, // y
NormalizeInForward,
Rank,
2, // MPerthread
ck::Sequence<1, 1, 1, 1, 1>, // scalarPerVector: x, mean, meansquare, scale, bias
ck::Sequence<1>>; // scalarPerVector: y
using DeviceInvVarianceInstance = ck::tensor_operation::device::DeviceElementwise<
ck::Tuple<AccDataType, AccDataType>, // mean, meansquare
ck::Tuple<AccDataType>, // invVariance
InvVariance,
NumScaleBiasMeanVarDim,
2, // MPerthread
ck::Sequence<1, 1>, // scalarPerVector: mean, meansquare
ck::Sequence<1>>; // scalarPerVector: invVariance
using DeviceMovingAverageInstance = ck::tensor_operation::device::DeviceElementwise<
ck::Tuple<AccDataType, AccDataType, AccDataType, AccDataType>, // old moving mean, new mean,
// old moving variance, new
// meansquare
ck::Tuple<AccDataType, AccDataType>, // updated moving mean, updated moving variance
MovingAverage,
NumScaleBiasMeanVarDim,
4, // MPerthread
ck::Sequence<1, 1, 1, 1>, // scalarPerVector: old moving mean, new mean, old moving
// variance, new meansquare
ck::Sequence<1, 1>>; // scalarPerVector: updated moving mean, updated moving variance
using DeviceMovingAverageAndInvVarianceInstance =
ck::tensor_operation::device::DeviceElementwise<
ck::Tuple<AccDataType, AccDataType, AccDataType, AccDataType>, // old moving mean, new
// mean, old moving
// variance, new
// meansquare
ck::Tuple<AccDataType, AccDataType, AccDataType>, // updated moving mean, updated moving
// variance, invVariance
MovingAverageAndInvVariance,
NumScaleBiasMeanVarDim,
4, // MPerthread
ck::Sequence<1, 1, 1, 1>, // scalarPerVector: old moving mean, new mean, old moving
// variance, new meansquare
ck::Sequence<1, 1, 1>>; // scalarPerVector: updated moving mean, updated moving variance, invVariance
auto invariantDims = get_invariant_dims<Rank, NumBatchNormReduceDim>(reduceDims);
std::array<ck::index_t, Rank> aligned_scaleBiasMeanVarStrides{0};
int i = 0;
for(auto dim : invariantDims)
{
assert(xyLengths[dim] == bnScaleBiasMeanVarLengths[i]);
aligned_scaleBiasMeanVarStrides[dim] = bnScaleBiasMeanVarStrides[i];
i++;
};
int32_t reduceLength = 1;
for(auto dim : reduceDims)
reduceLength *= xyLengths[dim];
int32_t invariantLength = 1;
for(auto dim : invariantDims)
invariantLength *= xyLengths[dim];
size_t total_length = static_cast<size_t>(invariantLength) * reduceLength;
float avg_time = 0.0f;
std::size_t num_bytes = 0;
auto dev_mean_and_meansquare = DeviceMeanAndMeansquareInstance{};
void* p_mean = saveMeanAndInvVariance ? p_saveMean : p_tmp_mean;
const AccDataType alpha = ck::type_convert<AccDataType>(1.0f);
const AccDataType beta = ck::type_convert<AccDataType>(0.0f);
auto argument_ptr1 = dev_mean_and_meansquare.MakeArgumentPointer(
xyLengths,
xStrides,
bnScaleBiasMeanVarLengths,
{bnScaleBiasMeanVarStrides, bnScaleBiasMeanVarStrides},
reduceDims,
{&alpha, &alpha},
{&beta, &beta},
p_x,
{p_mean, p_tmp_meansquare},
ck::make_tuple(InElementwiseOperation_Mean{}, InElementwiseOperation_Meansquare{}),
ck::make_tuple(AccElementwiseOperation_Mean{reduceLength},
AccElementwiseOperation_Meansquare{reduceLength}));
auto dev_normalize = DeviceNormalizeInstance{};
auto argument_ptr2 =
dev_normalize.MakeArgumentPointer(xyLengths,
{xStrides,
aligned_scaleBiasMeanVarStrides,
aligned_scaleBiasMeanVarStrides,
aligned_scaleBiasMeanVarStrides,
aligned_scaleBiasMeanVarStrides},
{yStrides},
{p_x, p_mean, p_tmp_meansquare, p_scale, p_bias},
{p_y},
NormalizeInForward{epsilon});
if(!dev_mean_and_meansquare.IsSupportedArgument(argument_ptr1.get()) ||
!dev_normalize.IsSupportedArgument(argument_ptr2.get()))
{
std::cout << "The runtime parameters seems not supported by the Devic, exiting!"
<< std::endl;
return (-1);
};
auto invoker_ptr1 = dev_mean_and_meansquare.MakeInvokerPointer();
auto invoker_ptr2 = dev_normalize.MakeInvokerPointer();
avg_time += invoker_ptr1->Run(argument_ptr1.get(), StreamConfig{nullptr, time_kernel});
avg_time += invoker_ptr2->Run(argument_ptr2.get(), StreamConfig{nullptr, time_kernel});
num_bytes +=
(total_length * sizeof(InOutDataType) + invariantLength * 2 * sizeof(AccDataType)) + // No.1
(total_length * (1 * sizeof(InOutDataType) + 4 * sizeof(AccDataType)) +
total_length * sizeof(InOutDataType)); // No.2
if(saveMeanAndInvVariance && updateMovingAverage)
{
auto dev_moving_average_inv_variance = DeviceMovingAverageAndInvVarianceInstance{};
auto argument_ptr3 = dev_moving_average_inv_variance.MakeArgumentPointer(
bnScaleBiasMeanVarLengths,
{bnScaleBiasMeanVarStrides,
bnScaleBiasMeanVarStrides,
bnScaleBiasMeanVarStrides,
bnScaleBiasMeanVarStrides},
{bnScaleBiasMeanVarStrides, bnScaleBiasMeanVarStrides, bnScaleBiasMeanVarStrides},
{p_mean, p_runningMean, p_tmp_meansquare, p_runningVariance},
{p_runningMean, p_runningVariance, p_saveInvVariance},
MovingAverageAndInvVariance{epsilon, exponentialAverageFactor});
if(!dev_moving_average_inv_variance.IsSupportedArgument(argument_ptr3.get()))
{
std::cout << "Runtime parameters not supported by the Device, exiting!" << std::endl;
return (-1);
};
auto invoker_ptr3 = dev_moving_average_inv_variance.MakeInvokerPointer();
avg_time += invoker_ptr3->Run(argument_ptr3.get(), StreamConfig{nullptr, time_kernel});
num_bytes += invariantLength * (4 + 3) * sizeof(AccDataType) * 2; // No.5
}
else if(saveMeanAndInvVariance)
{
auto dev_inv_variance = DeviceInvVarianceInstance{};
auto argument_ptr3 = dev_inv_variance.MakeArgumentPointer(
bnScaleBiasMeanVarLengths,
{bnScaleBiasMeanVarStrides, bnScaleBiasMeanVarStrides},
{bnScaleBiasMeanVarStrides},
{p_mean, p_tmp_meansquare},
{p_saveInvVariance},
InvVariance{epsilon});
if(!dev_inv_variance.IsSupportedArgument(argument_ptr3.get()))
{
std::cout << "Runtime parameters not supported by the Device, exiting!" << std::endl;
return (-1);
};
auto invoker_ptr3 = dev_inv_variance.MakeInvokerPointer();
avg_time += invoker_ptr3->Run(argument_ptr3.get(), StreamConfig{nullptr, time_kernel});
num_bytes += invariantLength * (2 + 1) * sizeof(AccDataType);
}
else if(updateMovingAverage)
{
auto dev_moving_average = DeviceMovingAverageInstance{};
auto argument_ptr3 = dev_moving_average.MakeArgumentPointer(
bnScaleBiasMeanVarLengths,
{bnScaleBiasMeanVarStrides,
bnScaleBiasMeanVarStrides,
bnScaleBiasMeanVarStrides,
bnScaleBiasMeanVarStrides},
{bnScaleBiasMeanVarStrides, bnScaleBiasMeanVarStrides},
{p_mean, p_runningMean, p_tmp_meansquare, p_runningVariance},
{p_runningMean, p_runningVariance},
MovingAverage{exponentialAverageFactor});
if(!dev_moving_average.IsSupportedArgument(argument_ptr3.get()))
{
std::cout << "Runtime parameters not supported by the Device, exiting!" << std::endl;
return (-1);
};
auto invoker_ptr3 = dev_moving_average.MakeInvokerPointer();
avg_time += invoker_ptr3->Run(argument_ptr3.get(), StreamConfig{nullptr, time_kernel});
num_bytes += invariantLength * (4 + 2) * sizeof(AccDataType) * 2; // No.5
};
if(time_kernel)
{
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
};
return (0);
};
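// Illustrative call (hypothetical values, not part of the original file): for an NHWC
// input of lengths {N, H, W, C} with packed strides, the driver is expected to invoke
// roughly
//
//   bnorm_fwd<ck::half_t, float, 4, 3>(time_kernel,
//                                      updateMovingAverage,
//                                      saveMeanAndInvVariance,
//                                      {0, 1, 2},      // reduce N, H, W
//                                      {N, H, W, C},   // xyLengths
//                                      xStrides,
//                                      yStrides,
//                                      {C},            // bnScaleBiasMeanVarLengths
//                                      {1},            // bnScaleBiasMeanVarStrides
//                                      p_x, p_scale, p_bias, p_y,
//                                      averageFactor, p_runningMean, p_runningVariance,
//                                      epsilon, p_saveMean, p_saveInvVariance,
//                                      p_tmp_mean, p_tmp_meansquare);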
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <limits>
#include <iostream>
#include <vector>
#include <array>
#include <algorithm>
#include <getopt.h>
#include "ck/ck.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward_nhwc_c.hpp"
#include "batchnorm_forward_impl.hpp"
template <typename InOutDataType, typename AccDataType>
using ReferenceBatchNormFwdInstance =
ck::tensor_operation::host::ReferenceBatchNormFwd_Input_N_H_W_C_Output_C<InOutDataType,
AccDataType>;
static struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'},
{"verify", required_argument, nullptr, 'v'},
{"help", no_argument, nullptr, '?'},
{nullptr, 0, nullptr, 0}};
class BatchNormFwdArg
{
private:
int option_index = 0;
public:
std::vector<size_t> inOutLengths;
bool do_verification = false;
bool updateMovingAverage;
bool saveMeanAndInvVariance;
int data_type = 0;
int init_method = 2;
bool time_kernel = false;
public:
void show_usage(const char* cmd)
{
std::cout << "Usage of " << cmd << std::endl;
std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension "
"lengths, must have 4 integers for nhwc"
<< std::endl;
std::cout << "--verify or -v, 1/0 to indicate whether to verify the batch-normalization "
"result by "
"comparing with the host-based batch-normalization"
<< std::endl;
std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)" << std::endl;
std::cout << "Arg2: 1/0 to indicate whether to update the moving average and variance "
"(0=no, 1=yes)"
<< std::endl;
std::cout << "Arg3: 1/0 to indicate whether to save the calculated mean and invVariance "
"(0=no, 1=yes)"
<< std::endl;
std::cout << "Arg4: init method used for bnScale and bnBias (0=no init, 1=single integer "
"value, 2=scope integer "
"value, 3=decimal value)"
<< std::endl;
std::cout << "Arg5: time kernel (0=no, 1=yes)" << std::endl;
};
int processArgs(int argc, char* argv[])
{
using ck::host_common::getTypeValuesFromString;
int ch;
while(1)
{
ch = getopt_long(argc, argv, "D:v:", long_options, &option_index);
if(ch == -1)
break;
switch(ch)
{
case 'D':
if(!optarg)
throw std::runtime_error("Invalid option format!");
inOutLengths = getTypeValuesFromString<size_t>(optarg);
if(inOutLengths.size() != 4)
throw std::runtime_error(
"NHWC tensor layout should have 4 length values specified!");
break;
case 'v':
if(!optarg)
throw std::runtime_error("Invalid option format!");
do_verification = static_cast<bool>(std::atoi(optarg));
break;
case '?':
if(std::string(long_options[option_index].name) == "help")
{
show_usage(argv[0]);
return (-1);
};
break;
default: show_usage(argv[0]); return (-1);
};
};
if(optind + 5 > argc)
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
data_type = std::atoi(argv[optind++]);
updateMovingAverage = std::atoi(argv[optind++]);
saveMeanAndInvVariance = std::atoi(argv[optind++]);
init_method = std::atoi(argv[optind++]);
time_kernel = static_cast<bool>(std::atoi(argv[optind]));
if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6)
return (-1);
return (0);
};
};
using namespace ck;
template <typename InOutDataType, typename AccDataType>
bool bnorm_fwd_nhwc_test(bool do_verification,
int init_method,
bool time_kernel,
const std::vector<size_t> inOutLengths,
bool updateMovingAverage,
bool saveMeanAndInvVariance,
double averageFactor,
double epsilon)
{
// for NHWC BatchNorm calculation of mean and meansquare
constexpr int Rank = 4;
constexpr int NumReduceDim = 3;
const std::vector<size_t> scaleBiasMeanVarLengths = {inOutLengths[3]};
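// per-channel tensors (scale, bias, mean, variance) have length C = inOutLengths[3] for NHWC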
// input data of the batchnorm forward algorithm
Tensor<InOutDataType> x(inOutLengths);
Tensor<AccDataType> bnScale(scaleBiasMeanVarLengths);
Tensor<AccDataType> bnBias(scaleBiasMeanVarLengths);
// output data of the batchnorm forward algorithm
Tensor<InOutDataType> y_ref(inOutLengths);
Tensor<InOutDataType> y(inOutLengths);
Tensor<AccDataType> resultSaveMean_ref(scaleBiasMeanVarLengths);
Tensor<AccDataType> resultSaveInvVariance_ref(scaleBiasMeanVarLengths);
Tensor<AccDataType> resultRunningMean_ref(scaleBiasMeanVarLengths);
Tensor<AccDataType> resultRunningVariance_ref(scaleBiasMeanVarLengths);
auto inOutStrides = x.mDesc.GetStrides();
auto scaleBiasMeanVarStrides = bnScale.mDesc.GetStrides();
std::size_t num_thread = std::thread::hardware_concurrency();
if(updateMovingAverage)
{
if constexpr(std::is_same<InOutDataType, int8_t>::value)
{
x.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
const float x_mean = 0.0f;
const float x_stddev = 2.5f;
const float noise_stddev = 0.04f;
resultRunningMean_ref.GenerateTensorValue(
GeneratorTensor_4<AccDataType>{x_mean, noise_stddev}, num_thread);
resultRunningVariance_ref.GenerateTensorValue(
GeneratorTensor_4<AccDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);
}
else
{
const float x_mean = 0.0f;
const float x_stddev = 1.0f;
const float noise_stddev = 0.04f;
// input data in normal distribution
x.GenerateTensorValue(GeneratorTensor_4<InOutDataType>{x_mean, x_stddev}, num_thread);
// initialize the runningMean to values that deviate only slightly from the mean of the x values
resultRunningMean_ref.GenerateTensorValue(
GeneratorTensor_4<AccDataType>{x_mean, noise_stddev}, num_thread);
// initialize the runningVariance to values that deviate only slightly from the variance of the x values
resultRunningVariance_ref.GenerateTensorValue(
GeneratorTensor_4<AccDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);
};
}
else
{
if constexpr(std::is_same<InOutDataType, int8_t>::value)
x.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
else
x.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0f, 5.0f}, num_thread);
};
if(do_verification)
{
switch(init_method)
{
case 0:
bnScale.GenerateTensorValue(GeneratorTensor_0<AccDataType>{}, num_thread);
bnBias.GenerateTensorValue(GeneratorTensor_0<AccDataType>{}, num_thread);
break;
case 1:
bnScale.GenerateTensorValue(GeneratorTensor_1<AccDataType>{1}, num_thread);
bnBias.GenerateTensorValue(GeneratorTensor_1<AccDataType>{0}, num_thread);
break;
case 2:
bnScale.GenerateTensorValue(GeneratorTensor_2<AccDataType>{-5, 5}, num_thread);
bnBias.GenerateTensorValue(GeneratorTensor_2<AccDataType>{-5, 5}, num_thread);
break;
default:
bnScale.GenerateTensorValue(GeneratorTensor_3<AccDataType>{-5.0f, 5.0f}, num_thread);
bnBias.GenerateTensorValue(GeneratorTensor_3<AccDataType>{-5.0f, 5.0f}, num_thread);
}
};
// these buffers are usually provided by the user application
DeviceMem x_dev(sizeof(InOutDataType) * x.mDesc.GetElementSpaceSize());
DeviceMem y_dev(sizeof(InOutDataType) * y.mDesc.GetElementSpaceSize());
DeviceMem bnScale_dev(sizeof(AccDataType) * bnScale.mDesc.GetElementSpaceSize());
DeviceMem bnBias_dev(sizeof(AccDataType) * bnBias.mDesc.GetElementSpaceSize());
// mean_dev or resultSaveMean_dev
DeviceMem resultSaveMean_dev(sizeof(AccDataType) *
resultSaveMean_ref.mDesc.GetElementSpaceSize());
// meansquare_dev or resultSaveInvVariance_dev
DeviceMem resultSaveInvVariance_dev(sizeof(AccDataType) *
resultSaveInvVariance_ref.mDesc.GetElementSpaceSize());
// resultRunningMean_dev
DeviceMem resultRunningMean_dev(sizeof(AccDataType) *
resultRunningMean_ref.mDesc.GetElementSpaceSize());
// resultRunningVariance_dev
DeviceMem resultRunningVariance_dev(sizeof(AccDataType) *
resultRunningVariance_ref.mDesc.GetElementSpaceSize());
x_dev.ToDevice(x.mData.data());
bnScale_dev.ToDevice(bnScale.mData.data());
bnBias_dev.ToDevice(bnBias.mData.data());
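// the pre-existing running statistics are uploaded only when the kernel is going to update them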
if(updateMovingAverage)
{
resultRunningMean_dev.ToDevice(resultRunningMean_ref.mData.data());
resultRunningVariance_dev.ToDevice(resultRunningVariance_ref.mData.data());
};
std::array<index_t, Rank> i_inOutLengths;
std::array<index_t, Rank> i_inOutStrides;
std::array<index_t, Rank - NumReduceDim> i_scaleBiasMeanVarLengths;
std::array<index_t, Rank - NumReduceDim> i_scaleBiasMeanVarStrides;
std::copy(inOutLengths.begin(), inOutLengths.end(), i_inOutLengths.begin());
std::copy(inOutStrides.begin(), inOutStrides.end(), i_inOutStrides.begin());
std::copy(scaleBiasMeanVarLengths.begin(),
scaleBiasMeanVarLengths.end(),
i_scaleBiasMeanVarLengths.begin());
std::copy(scaleBiasMeanVarStrides.begin(),
scaleBiasMeanVarStrides.end(),
i_scaleBiasMeanVarStrides.begin());
int result = 0;
// workspace holding the temporary per-channel mean and meansquare buffers
DeviceMem workspace(sizeof(AccDataType) * 2 * resultSaveMean_ref.mDesc.GetElementSpaceSize() +
128);
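// Carve the workspace into two sub-buffers: p_tmp_mean at offset 0 and p_tmp_meansquare right
// after it, with the second offset rounded up to a 64-byte boundary ((size + 63) / 64 * 64);
// the extra 128 bytes in the allocation cover this alignment padding.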
void* p_tmp_mean = workspace.GetDeviceBuffer();
void* p_tmp_meansquare =
static_cast<char*>(p_tmp_mean) +
(sizeof(AccDataType) * resultSaveMean_ref.mDesc.GetElementSpaceSize() + 63) / 64 * 64;
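// Run the device batchnorm-forward path: statistics are reduced over dims 0, 1, 2 (N, H, W)
// of the NHWC input, producing per-channel results along dim 3 (C).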
result = bnorm_fwd<InOutDataType, AccDataType, Rank, NumReduceDim, false>(
time_kernel,
updateMovingAverage,
saveMeanAndInvVariance,
{0, 1, 2},
i_inOutLengths,
i_inOutStrides,
i_inOutStrides,
i_scaleBiasMeanVarLengths,
i_scaleBiasMeanVarStrides,
x_dev.GetDeviceBuffer(),
bnScale_dev.GetDeviceBuffer(),
bnBias_dev.GetDeviceBuffer(),
y_dev.GetDeviceBuffer(),
averageFactor,
updateMovingAverage ? resultRunningMean_dev.GetDeviceBuffer() : nullptr,
updateMovingAverage ? resultRunningVariance_dev.GetDeviceBuffer() : nullptr,
epsilon,
saveMeanAndInvVariance ? resultSaveMean_dev.GetDeviceBuffer() : nullptr,
saveMeanAndInvVariance ? resultSaveInvVariance_dev.GetDeviceBuffer() : nullptr,
p_tmp_mean,
p_tmp_meansquare);
if(result < 0)
return (false);
bool pass = true;
if(do_verification)
{
auto batchNormFwd_ref = ReferenceBatchNormFwdInstance<InOutDataType, AccDataType>{};
auto argument_ptr_ref = batchNormFwd_ref.MakeArgumentPointer(
i_inOutLengths,
i_inOutStrides,
i_inOutStrides,
i_scaleBiasMeanVarLengths,
i_scaleBiasMeanVarStrides,
x.mData.data(),
bnScale.mData.data(),
bnBias.mData.data(),
y_ref.mData.data(),
averageFactor, // exponentialAverageFactor
updateMovingAverage ? resultRunningMean_ref.mData.data() : nullptr, // resultRunningMean
updateMovingAverage ? resultRunningVariance_ref.mData.data()
: nullptr, // resultRunningVariance
epsilon,
saveMeanAndInvVariance ? resultSaveMean_ref.mData.data() : nullptr,
saveMeanAndInvVariance ? resultSaveInvVariance_ref.mData.data() : nullptr);
if(!batchNormFwd_ref.IsSupportedArgument(argument_ptr_ref.get()))
{
std::cout << "The runtime parameters are not supported by the reference BatchNorm "
"instance, exiting!"
<< std::endl;
return (false);
};
auto invoker_ptr_ref = batchNormFwd_ref.MakeInvokerPointer();
(void)invoker_ptr_ref->Run(argument_ptr_ref.get());
y_dev.FromDevice(y.mData.data());
pass = pass && ck::utils::check_err(y.mData, y_ref.mData);
if(updateMovingAverage)
{
Tensor<AccDataType> resultRunningMean(scaleBiasMeanVarLengths);
Tensor<AccDataType> resultRunningVariance(scaleBiasMeanVarLengths);
resultRunningMean_dev.FromDevice(resultRunningMean.mData.data());
resultRunningVariance_dev.FromDevice(resultRunningVariance.mData.data());
pass =
pass && ck::utils::check_err(resultRunningMean.mData, resultRunningMean_ref.mData);
pass = pass && ck::utils::check_err(resultRunningVariance.mData,
resultRunningVariance_ref.mData);
};
if(saveMeanAndInvVariance)
{
Tensor<AccDataType> resultSaveMean(scaleBiasMeanVarLengths);
Tensor<AccDataType> resultSaveInvVariance(scaleBiasMeanVarLengths);
resultSaveMean_dev.FromDevice(resultSaveMean.mData.data());
resultSaveInvVariance_dev.FromDevice(resultSaveInvVariance.mData.data());
pass = pass && ck::utils::check_err(resultSaveMean.mData, resultSaveMean_ref.mData);
pass = pass && ck::utils::check_err(resultSaveInvVariance.mData,
resultSaveInvVariance_ref.mData);
};
};
return (pass);
};
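// epsilon is added to the variance before the 1/sqrt() to guard against division by zero;
// averageFactor is the exponential moving-average weight given to the newly computed
// mean/variance when updating the running statistics.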
const double epsilon = std::numeric_limits<float>::epsilon();
static const double averageFactor = 0.1;
int main(int argc, char* argv[])
{
bool pass = true;
if(argc > 1)
{
BatchNormFwdArg arg;
if(arg.processArgs(argc, argv) < 0)
return (-1);
if(arg.data_type == 0)
{
pass = bnorm_fwd_nhwc_test<ck::half_t, float>(arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inOutLengths,
arg.updateMovingAverage,
arg.saveMeanAndInvVariance,
averageFactor,
epsilon);
}
else if(arg.data_type == 1)
{
pass = bnorm_fwd_nhwc_test<float, float>(arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inOutLengths,
arg.updateMovingAverage,
arg.saveMeanAndInvVariance,
averageFactor,
epsilon);
}
else if(arg.data_type == 3)
{
pass = bnorm_fwd_nhwc_test<int8_t, float>(arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inOutLengths,
arg.updateMovingAverage,
arg.saveMeanAndInvVariance,
averageFactor,
epsilon);
}
else if(arg.data_type == 5)
{
pass = bnorm_fwd_nhwc_test<ck::bhalf_t, float>(arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inOutLengths,
arg.updateMovingAverage,
arg.saveMeanAndInvVariance,
averageFactor,
epsilon);
}
else if(arg.data_type == 6)
{
pass = bnorm_fwd_nhwc_test<double, double>(arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inOutLengths,
arg.updateMovingAverage,
arg.saveMeanAndInvVariance,
averageFactor,
epsilon);
}
}
else
{
pass = bnorm_fwd_nhwc_test<ck::half_t, float>(true,
2,
false, // don't time kernel
{128, 16, 16, 1024},
true,
false,
averageFactor,
epsilon);
};
return (pass ? 0 : 1);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cassert>
#include <vector>
#include "ck/ck.hpp"
#include "ck/utility/sequence.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "batchnorm_common.hpp"
template <typename InOutDataType,
typename AccDataType,
ck::index_t Rank,
ck::index_t NumBatchNormReduceDim,
bool fastest_dim_is_reduced = false>
int bnorm_infer(
bool time_kernel,
const std::array<int, NumBatchNormReduceDim> reduceDims,
const std::array<ck::index_t, Rank> xyLengths,
const std::array<ck::index_t, Rank> xStrides,
const std::array<ck::index_t, Rank> yStrides,
const std::array<ck::index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarLengths,
const std::array<ck::index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarStrides,
const void* p_x,
const void* p_scale,
const void* p_bias,
double epsilon,
const void* p_estimatedMean,
const void* p_estimatedVariance,
void* p_y)
{
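// Batchnorm inference needs no reduction: with the estimated mean and variance already given,
// y = (x - mean) / sqrt(variance + epsilon) * scale + bias is a pure elementwise computation,
// so a single generic DeviceElementwise kernel with broadcast parameter tensors suffices.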
(void)bnScaleBiasMeanVarLengths;
static_assert(NumBatchNormReduceDim < Rank,
"Invalid number of reduced dimensions for batchnorm!");
using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise<
ck::Tuple<InOutDataType, AccDataType, AccDataType, AccDataType, AccDataType>, // x, mean,
// variance,
// scale,
// bias,
ck::Tuple<InOutDataType>, // y
NormalizeInInfer,
Rank,
2, // MPerThread
ck::Sequence<1, 1, 1, 1, 1>, // x, mean, variance, scale, bias
ck::Sequence<1>>; // scalarPerVector: y
auto invariantDims = get_invariant_dims<Rank, NumBatchNormReduceDim>(reduceDims);
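// Map the 1-D per-channel strides onto the Rank-D index space: the invariant (channel) dims
// keep their real strides while every reduced dim gets stride 0, so each per-channel value is
// broadcast across the reduced dimensions.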
std::array<ck::index_t, Rank> aligned_scaleBiasMeanVarStrides{0};
int i = 0;
for(auto dim : invariantDims)
{
assert(xyLengths[dim] == bnScaleBiasMeanVarLengths[i]);
aligned_scaleBiasMeanVarStrides[dim] = bnScaleBiasMeanVarStrides[i];
i++;
};
int32_t reduceLength = 1;
for(auto dim : reduceDims)
reduceLength *= xyLengths[dim];
int32_t invariantLength = 1;
for(auto dim : invariantDims)
invariantLength *= xyLengths[dim];
size_t total_length = static_cast<size_t>(invariantLength) * reduceLength;
float avg_time = 0.0f;
std::size_t num_bytes = 0;
auto dev_normalize = DeviceNormalizeInstance{};
auto argument_ptr1 = dev_normalize.MakeArgumentPointer(
xyLengths,
{xStrides,
aligned_scaleBiasMeanVarStrides,
aligned_scaleBiasMeanVarStrides,
aligned_scaleBiasMeanVarStrides,
aligned_scaleBiasMeanVarStrides},
{yStrides},
{p_x, p_estimatedMean, p_estimatedVariance, p_scale, p_bias},
{p_y},
NormalizeInInfer{epsilon});
if(!dev_normalize.IsSupportedArgument(argument_ptr1.get()))
{
std::cout << "The runtime parameters seems not supported by the Devic, exiting!"
<< std::endl;
return (-1);
};
auto invoker_ptr1 = dev_normalize.MakeInvokerPointer();
avg_time += invoker_ptr1->Run(argument_ptr1.get(), StreamConfig{nullptr, time_kernel});
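// Byte count: for every output element the kernel reads one x value plus the four AccDataType
// parameter values (as addressed through the broadcast strides) and writes one y value.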
num_bytes += (total_length * (1 * sizeof(InOutDataType) + 4 * sizeof(AccDataType)) +
total_length * sizeof(InOutDataType));
if(time_kernel)
{
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
};
return (0);
};
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <limits>
#include <iostream>
#include <vector>
#include <array>
#include <algorithm>
#include <thread>
#include <getopt.h>
#include "ck/ck.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer_nhwc_c.hpp"
#include "batchnorm_infer_impl.hpp"
template <typename InOutDataType, typename AccDataType>
using ReferenceBatchNormInferInstance =
ck::tensor_operation::host::ReferenceBatchNormInfer_Input_N_H_W_C_Output_C<InOutDataType,
AccDataType>;
static struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'},
{"verify", required_argument, nullptr, 'v'},
{"help", no_argument, nullptr, '?'},
{nullptr, 0, nullptr, 0}};
class BatchNormInferArg
{
private:
int option_index = 0;
public:
std::vector<size_t> inOutLengths;
bool do_verification = false;
int data_type = 0;
int init_method = 2;
bool time_kernel = false;
public:
void show_usage(const char* cmd)
{
std::cout << "Usage of " << cmd << std::endl;
std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension "
"lengths, must have 4 integers for nhwc"
<< std::endl;
std::cout << "--verify or -v, 1/0 to indicate whether to verify the batch-normalization "
"result by "
"comparing with the host-based batch-normalization"
<< std::endl;
std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)" << std::endl;
std::cout << "Arg2: init method used for bnScale and bnBias (0=no init, 1=single integer "
"value, 2=scope integer "
"value, 3=decimal value)"
<< std::endl;
std::cout << "Arg3: time kernel (0=no, 1=yes)" << std::endl;
};
int processArgs(int argc, char* argv[])
{
using ck::host_common::getTypeValuesFromString;
int ch;
while(1)
{
ch = getopt_long(argc, argv, "D:v:", long_options, &option_index);
if(ch == -1)
break;
switch(ch)
{
case 'D':
if(!optarg)
throw std::runtime_error("Invalid option format!");
inOutLengths = getTypeValuesFromString<size_t>(optarg);
if(inOutLengths.size() != 4)
throw std::runtime_error(
"NHWC tensor layout should have 4 length values specified!");
break;
case 'v':
if(!optarg)
throw std::runtime_error("Invalid option format!");
do_verification = static_cast<bool>(std::atoi(optarg));
break;
case '?':
if(std::string(long_options[option_index].name) == "help")
{
show_usage(argv[0]);
return (-1);
};
break;
default: show_usage(argv[0]); return (-1);
};
};
if(optind + 3 > argc)
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
data_type = std::atoi(argv[optind++]);
init_method = std::atoi(argv[optind++]);
time_kernel = static_cast<bool>(std::atoi(argv[optind]));
if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6)
return (-1);
return (0);
};
};
using namespace ck;
template <typename InOutDataType, typename AccDataType>
bool bnorm_infer_nhwc_test(bool do_verification,
int init_method,
bool time_kernel,
const std::vector<size_t> inOutLengths,
double epsilon)
{
// NHWC BatchNorm inference: rank-4 input with per-channel (C) scale/bias/mean/variance
constexpr int Rank = 4;
constexpr int NumReduceDim = 3;
const std::vector<size_t> scaleBiasMeanVarLengths = {inOutLengths[3]};
// input data of the batchnorm inference algorithm
Tensor<InOutDataType> x(inOutLengths);
Tensor<AccDataType> bnScale(scaleBiasMeanVarLengths);
Tensor<AccDataType> bnBias(scaleBiasMeanVarLengths);
// output data of the batchnorm inference algorithm
Tensor<InOutDataType> y_ref(inOutLengths);
Tensor<InOutDataType> y(inOutLengths);
Tensor<AccDataType> estimatedMean(scaleBiasMeanVarLengths);
Tensor<AccDataType> estimatedVariance(scaleBiasMeanVarLengths);
auto inOutStrides = x.mDesc.GetStrides();
auto scaleBiasMeanVarStrides = bnScale.mDesc.GetStrides();
std::size_t num_thread = std::thread::hardware_concurrency();
if constexpr(std::is_same<InOutDataType, int8_t>::value)
{
x.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
const float x_mean = 0.0f;
const float x_stddev = 2.5f;
const float noise_stddev = 0.0001f;
estimatedMean.GenerateTensorValue(GeneratorTensor_4<AccDataType>{x_mean, noise_stddev},
num_thread);
estimatedVariance.GenerateTensorValue(
GeneratorTensor_4<AccDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);
}
else
{
const float x_mean = 0.0f;
const float x_stddev = 1.0f;
const float noise_stddev = 0.0001f;
x.GenerateTensorValue(GeneratorTensor_4<InOutDataType>{x_mean, x_stddev}, num_thread);
// initialize the estimatedMean to values that deviate only slightly from the mean of the x values
estimatedMean.GenerateTensorValue(GeneratorTensor_4<AccDataType>{x_mean, noise_stddev},
num_thread);
// initialize the estimatedVariance to values that deviate only slightly from the variance of the x values
estimatedVariance.GenerateTensorValue(
GeneratorTensor_4<AccDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);
};
if(do_verification)
{
switch(init_method)
{
case 0:
bnScale.GenerateTensorValue(GeneratorTensor_0<AccDataType>{}, num_thread);
bnBias.GenerateTensorValue(GeneratorTensor_0<AccDataType>{}, num_thread);
break;
case 1:
bnScale.GenerateTensorValue(GeneratorTensor_1<AccDataType>{1}, num_thread);
bnBias.GenerateTensorValue(GeneratorTensor_1<AccDataType>{0}, num_thread);
break;
case 2:
bnScale.GenerateTensorValue(GeneratorTensor_2<AccDataType>{-5, 5}, num_thread);
bnBias.GenerateTensorValue(GeneratorTensor_2<AccDataType>{-5, 5}, num_thread);
break;
default:
bnScale.GenerateTensorValue(GeneratorTensor_3<AccDataType>{-5.0f, 5.0f}, num_thread);
bnBias.GenerateTensorValue(GeneratorTensor_3<AccDataType>{-5.0f, 5.0f}, num_thread);
}
};
// these buffers are usually provided by the user application
DeviceMem x_dev(sizeof(InOutDataType) * x.mDesc.GetElementSpaceSize());
DeviceMem y_dev(sizeof(InOutDataType) * y.mDesc.GetElementSpaceSize());
DeviceMem bnScale_dev(sizeof(AccDataType) * bnScale.mDesc.GetElementSpaceSize());
DeviceMem bnBias_dev(sizeof(AccDataType) * bnBias.mDesc.GetElementSpaceSize());
// device buffer for the estimated per-channel mean
DeviceMem estimatedMean_dev(sizeof(AccDataType) * estimatedMean.mDesc.GetElementSpaceSize());
// device buffer for the estimated per-channel variance
DeviceMem estimatedVariance_dev(sizeof(AccDataType) *
estimatedVariance.mDesc.GetElementSpaceSize());
x_dev.ToDevice(x.mData.data());
bnScale_dev.ToDevice(bnScale.mData.data());
bnBias_dev.ToDevice(bnBias.mData.data());
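// in inference the mean/variance are fixed inputs (e.g. the running statistics saved during
// training); they are only uploaded, never written back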
estimatedMean_dev.ToDevice(estimatedMean.mData.data());
estimatedVariance_dev.ToDevice(estimatedVariance.mData.data());
using ck::index_t;
std::array<index_t, Rank> i_inOutLengths;
std::array<index_t, Rank> i_inOutStrides;
std::array<index_t, Rank - NumReduceDim> i_scaleBiasMeanVarLengths;
std::array<index_t, Rank - NumReduceDim> i_scaleBiasMeanVarStrides;
std::copy(inOutLengths.begin(), inOutLengths.end(), i_inOutLengths.begin());
std::copy(inOutStrides.begin(), inOutStrides.end(), i_inOutStrides.begin());
std::copy(scaleBiasMeanVarLengths.begin(),
scaleBiasMeanVarLengths.end(),
i_scaleBiasMeanVarLengths.begin());
std::copy(scaleBiasMeanVarStrides.begin(),
scaleBiasMeanVarStrides.end(),
i_scaleBiasMeanVarStrides.begin());
int result = 0;
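// Run the device batchnorm-inference path: dims 0, 1, 2 (N, H, W) are the dimensions across
// which the per-channel scale/bias/mean/variance are broadcast; dim 3 (C) is the channel dim.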
result = bnorm_infer<InOutDataType, AccDataType, Rank, NumReduceDim, false>(
time_kernel,
{0, 1, 2},
i_inOutLengths,
i_inOutStrides,
i_inOutStrides,
i_scaleBiasMeanVarLengths,
i_scaleBiasMeanVarStrides,
x_dev.GetDeviceBuffer(),
bnScale_dev.GetDeviceBuffer(),
bnBias_dev.GetDeviceBuffer(),
epsilon,
estimatedMean_dev.GetDeviceBuffer(),
estimatedVariance_dev.GetDeviceBuffer(),
y_dev.GetDeviceBuffer());
if(result < 0)
return (false);
bool pass = true;
if(do_verification)
{
auto batchNormInfer_ref = ReferenceBatchNormInferInstance<InOutDataType, AccDataType>{};
auto argument_ptr_ref =
batchNormInfer_ref.MakeArgumentPointer(i_inOutLengths,
i_inOutStrides,
i_inOutStrides,
i_scaleBiasMeanVarLengths,
i_scaleBiasMeanVarStrides,
x.mData.data(),
bnScale.mData.data(),
bnBias.mData.data(),
epsilon,
estimatedMean.mData.data(),
estimatedVariance.mData.data(),
y_ref.mData.data());
if(!batchNormInfer_ref.IsSupportedArgument(argument_ptr_ref.get()))
{
std::cout << "The runtime parameters are not supported by the reference BatchNorm "
"instance, exiting!"
<< std::endl;
return (false);
};
auto invoker_ptr_ref = batchNormInfer_ref.MakeInvokerPointer();
(void)invoker_ptr_ref->Run(argument_ptr_ref.get());
y_dev.FromDevice(y.mData.data());
pass = pass && ck::utils::check_err(y.mData, y_ref.mData);
};
return (pass);
};
static const double epsilon = std::numeric_limits<float>::epsilon();
int main(int argc, char* argv[])
{
bool pass = true;
if(argc > 1)
{
BatchNormInferArg arg;
if(arg.processArgs(argc, argv) < 0)
return (-1);
if(arg.data_type == 0)
{
pass = bnorm_infer_nhwc_test<ck::half_t, float>(
arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon);
}
else if(arg.data_type == 1)
{
pass = bnorm_infer_nhwc_test<float, float>(
arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon);
}
else if(arg.data_type == 3)
{
pass = bnorm_infer_nhwc_test<int8_t, float>(
arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon);
}
else if(arg.data_type == 5)
{
pass = bnorm_infer_nhwc_test<ck::bhalf_t, float>(
arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon);
}
else if(arg.data_type == 6)
{
pass = bnorm_infer_nhwc_test<double, double>(
arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon);
};
}
else
{
pass = bnorm_infer_nhwc_test<ck::half_t, float>(true,
2,
false, // don't time kernel
{128, 16, 16, 1024},
epsilon);
};
return (pass ? 0 : 1);
}
......@@ -47,3 +47,6 @@ add_subdirectory(29_batched_gemm_bias_e_permute)
add_subdirectory(30_grouped_convnd_fwd_bias_relu_add)
add_subdirectory(31_batched_gemm_gemm)
add_subdirectory(32_batched_gemm_softmax_gemm)
add_subdirectory(33_multiple_reduce)
add_subdirectory(34_batchnorm)