Unverified Commit 53ea4713 authored by Qianfeng, committed by GitHub

Batchnorm-forward and Batchnorm-infer Implemented using generic kernels (#320)

* Implement multiple-reduction in one kernel (kernels, device ops, examples)

* Add generic elementwise kernel and device interface

* Add generator for normal-distributed data initialization

* Add host reference implementation of batchnorm-forward and batchnorm-infer

* Add examples for implementing batchnorm-forward and batchnorm-infer using generic kernels

* Remove unneeded include in batchnorm example

* Renaming generic_elementwise to elementwise in kernel and device classes/functions

* Change in gemm_layernorm examples to use DeviceElementwise instead of Device5AryElementwise

* Change in example 19_binary_elementwise to use DeviceElementwise instead of DeviceBinaryElementwise

* Change in device_cgemm_4gemm_xdl_cshuffle.hpp to use kernel_elementwise instead of kernel_binary_elementwise

* Add DeviceElementwiseBase and use it in device_normalize_instance.cpp

* Removing and renaming files

* Update to synchronize the gemm_layernorm client example with the generic element-wise device op API

* Update to synchronize with the latest headers directory and HostTensorDescriptor interface renaming

* Merge two static member functions in device_elementwise.hpp

* Remove unary_elementwise_1d kernel and device
parent 5ee30459
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
namespace ck {
template <typename Grid1dBufferDescTuple,
index_t NumBuffer,
index_t BlockSize,
typename DataTypePointerTuple,
typename DataTypeTuple>
__global__ void
kernel_multiple_buffer_set_value(const Grid1dBufferDescTuple grid_1d_buffer_desc_tuple,
DataTypePointerTuple p_global_tuple,
DataTypeTuple value_tuple)
{
static_assert(NumBuffer == DataTypePointerTuple::Size() && NumBuffer == DataTypeTuple::Size(),
"The tuple size should be same as NumBuffer!");
static_for<0, NumBuffer, 1>{}([&](auto iB) {
using DataTypePointer = remove_cvref_t<decltype(DataTypePointerTuple{}[iB])>;
using DataTypeFromPointer = remove_pointer_t<DataTypePointer>;
using DataType = remove_cvref_t<decltype(DataTypeTuple{}[iB])>;
static_assert(is_same<DataType, DataTypeFromPointer>::value,
"Types in tuples does not match!");
});
constexpr auto I0 = Number<0>{};
const index_t thread_global_id = get_thread_global_1d_id();
auto value_buf_tuple = generate_tuple(
[&](auto iB) {
using DataType = remove_cvref_t<decltype(DataTypeTuple{}[iB])>;
return StaticBuffer<AddressSpaceEnum::Vgpr, DataType, 1, true>{};
},
Number<NumBuffer>{});
static_for<0, NumBuffer, 1>{}([&](auto iB) {
static_for<0, 1, 1>{}([&](auto J) { value_buf_tuple(iB)(J) = value_tuple[iB]; });
});
auto global_buf_tuple = generate_tuple(
[&](auto iB) {
return make_dynamic_buffer<AddressSpaceEnum::Global>(
p_global_tuple(iB), grid_1d_buffer_desc_tuple[iB].GetElementSpaceSize());
},
Number<NumBuffer>{});
constexpr auto val_buff_desc = make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
static_for<0, NumBuffer, 1>{}([&](auto iB) {
using DataType = remove_cvref_t<decltype(DataTypeTuple{}[iB])>;
using PassThroughOp = tensor_operation::element_wise::PassThrough;
auto threadwise_store =
ThreadwiseTensorSliceTransfer_v1r3<DataType,
DataType,
decltype(val_buff_desc),
decltype(Grid1dBufferDescTuple{}[iB]),
PassThroughOp,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum::Set,
1,
true>(
grid_1d_buffer_desc_tuple[iB], make_multi_index(thread_global_id), PassThroughOp{});
threadwise_store.Run(val_buff_desc,
make_tuple(I0),
value_buf_tuple(iB),
grid_1d_buffer_desc_tuple[iB],
global_buf_tuple(iB));
});
};
} // namespace ck
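
The kernel above fills an arbitrary number of global buffers in a single launch, with the buffer pointers, element types, and fill values carried as tuples and expanded at compile time. As a rough host-side analogue (plain standard C++; set_buffers and fill_one are hypothetical helpers, not part of the CK API), the same tuple-driven expansion can be sketched as below. One plausible use, assumed here rather than shown in this diff, is zero-initializing per-channel workspace buffers before a multi-output reduction.

// Illustrative sketch only -- not part of this commit. A host-side analogue of
// the tuple-driven fill that kernel_multiple_buffer_set_value performs on the
// GPU: each buffer in the tuple is written with its own value, expanded at
// compile time over the parameter pack (mirroring static_for over NumBuffer).
#include <cstdio>
#include <tuple>
#include <utility>
#include <vector>

template <typename T>
void fill_one(T* p, T value, std::size_t n)
{
    for(std::size_t i = 0; i < n; ++i)
        p[i] = value;
}

template <typename PtrTuple, typename ValTuple, std::size_t... Is>
void set_buffers_impl(PtrTuple ptrs, ValTuple vals, std::size_t n, std::index_sequence<Is...>)
{
    // comma-fold: one fill per tuple element
    (fill_one(std::get<Is>(ptrs), std::get<Is>(vals), n), ...);
}

template <typename... Ptrs, typename... Vals>
void set_buffers(std::tuple<Ptrs...> ptrs, std::tuple<Vals...> vals, std::size_t n)
{
    static_assert(sizeof...(Ptrs) == sizeof...(Vals), "The tuple sizes must match!");
    set_buffers_impl(ptrs, vals, n, std::index_sequence_for<Ptrs...>{});
}

int main()
{
    std::vector<float> mean(8), meansquare(8);
    std::vector<double> count(8);
    // Hypothetical usage: zero two float workspaces and one double workspace in one call.
    set_buffers(std::make_tuple(mean.data(), meansquare.data(), count.data()),
                std::make_tuple(0.0f, 0.0f, 0.0),
                8);
    std::printf("%f %f %f\n", mean[0], meansquare[0], count[0]);
    return 0;
}
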
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace ck {
template <typename GridwiseUEltwise,
typename ADataType,
typename BDataType,
typename GridDesc_M0,
typename ElementwiseFunctor>
__global__ void kernel_unary_elementwise_1d(const ADataType* __restrict__ p_a_global,
BDataType* __restrict__ p_b_global,
const GridDesc_M0 a_grid_desc_m0,
const GridDesc_M0 b_grid_desc_m0,
const ElementwiseFunctor functor)
{
GridwiseUEltwise::Run(p_a_global, p_b_global, a_grid_desc_m0, b_grid_desc_m0, functor);
}
template <typename ADataType,
typename BDataType,
typename GridDesc_M0,
typename ElementwiseFunctor,
index_t ScalarPerVector>
struct GridwiseUnaryElementwise_1D
{
static constexpr auto I0 = Number<0>{};
static constexpr auto thread_desc_m0 =
make_naive_tensor_descriptor_packed(make_tuple(Number<ScalarPerVector>{}));
using PassThrough = tensor_operation::element_wise::PassThrough;
static __device__ auto CalculateElementwiseIndex()
{
const index_t global_thread_id = get_thread_global_1d_id();
return make_multi_index(global_thread_id * ScalarPerVector);
}
__host__ __device__ static constexpr bool CheckValidity(const GridDesc_M0 a_grid_desc_m0,
const GridDesc_M0 b_grid_desc_m0)
{
return a_grid_desc_m0.GetLength(I0) == b_grid_desc_m0.GetLength(I0);
}
__host__ __device__ static constexpr index_t CalculateGridSize(const index_t tensor_size)
{
const index_t grid_size = math::integer_divide_ceil(tensor_size, 256 * ScalarPerVector);
return grid_size;
}
__device__ static void Run(const ADataType* __restrict__ p_a_global,
BDataType* __restrict__ p_b_global,
const GridDesc_M0 a_grid_desc_m0,
const GridDesc_M0 b_grid_desc_m0,
const ElementwiseFunctor functor)
{
const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_a_global, a_grid_desc_m0.GetElementSpaceSize());
auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_b_global, b_grid_desc_m0.GetElementSpaceSize());
StaticBuffer<AddressSpaceEnum::Vgpr, ADataType, ScalarPerVector, true> a_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, BDataType, ScalarPerVector, true> b_thread_buf;
const auto thread_store_global_offset = CalculateElementwiseIndex();
auto a_global_load =
ThreadwiseTensorSliceTransfer_v2<ADataType,
ADataType,
GridDesc_M0,
decltype(thread_desc_m0),
Sequence<ScalarPerVector>, // SliceLengths
Sequence<0>, // DimAccessOrder
0, // SrcVectorDim
ScalarPerVector,
1, // SrcScalarStrideInVector
false>{a_grid_desc_m0, thread_store_global_offset};
auto b_global_write =
ThreadwiseTensorSliceTransfer_v1r3<BDataType,
BDataType,
decltype(thread_desc_m0),
GridDesc_M0,
PassThrough,
Sequence<ScalarPerVector>, // SliceLengths
Sequence<0>, // DimAccessOrder
0, // DstVectorDim
ScalarPerVector,
InMemoryDataOperationEnum::Set,
1, // DstScalarStrideInVector
false>{
b_grid_desc_m0, thread_store_global_offset, PassThrough{}};
const index_t blockSize = get_block_size();
const index_t blockPerGrid = get_grid_size();
const auto m0 = b_grid_desc_m0.GetLength(I0);
const index_t loop_step = blockPerGrid * blockSize * ScalarPerVector;
const auto loop_step_index = make_multi_index(loop_step);
index_t num_iter = m0 / (loop_step);
do
{
// read and process ScalarPerVector elements
a_global_load.Run(
a_grid_desc_m0, a_global_buf, thread_desc_m0, make_tuple(I0), a_thread_buf);
static_for<0, ScalarPerVector, 1>{}([&](auto m) {
constexpr auto offset = thread_desc_m0.CalculateOffset(make_tuple(m));
functor(b_thread_buf(Number<offset>{}), a_thread_buf(Number<offset>{}));
});
b_global_write.Run(thread_desc_m0,
make_tuple(I0), // SrcSliceOriginIdx
b_thread_buf,
b_grid_desc_m0,
b_global_buf);
a_global_load.MoveSrcSliceWindow(a_grid_desc_m0, loop_step_index);
b_global_write.MoveDstSliceWindow(b_grid_desc_m0, loop_step_index);
} while(--num_iter);
}
};
} // namespace ck
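
GridwiseUnaryElementwise_1D partitions a 1-D tensor so that each thread owns ScalarPerVector contiguous elements; CalculateGridSize assumes a block size of 256, and the do/while loop in Run lets a grid smaller than that still cover the whole tensor by striding forward blockPerGrid * blockSize * ScalarPerVector elements per iteration. The standalone sketch below (plain C++, illustrative only, assuming the tensor length is an exact multiple of the loop step) replays that indexing on the host and checks that every element is visited exactly once.

// Illustrative sketch only -- not part of this commit. Replays the indexing of
// GridwiseUnaryElementwise_1D on the host: the grid size from CalculateGridSize
// (which assumes a block size of 256), the per-thread start offset from
// CalculateElementwiseIndex, and the grid-wide stride used by the slice-window
// moves, checking that every element is processed exactly once.
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <vector>

int main()
{
    constexpr int ScalarPerVector = 4;
    constexpr int BlockSize       = 256;
    const int tensor_size         = 2 * 256 * 1024; // assumed multiple of the loop step

    const int full_grid = (tensor_size + BlockSize * ScalarPerVector - 1) /
                          (BlockSize * ScalarPerVector); // CalculateGridSize
    const int grid_size = std::min(full_grid, 128);      // a smaller grid still covers the tensor
                                                         // because of the do/while stride loop
    const int loop_step = grid_size * BlockSize * ScalarPerVector; // blockPerGrid * blockSize * ScalarPerVector

    std::vector<int> visits(tensor_size, 0);
    for(int block = 0; block < grid_size; ++block)
        for(int tid = 0; tid < BlockSize; ++tid)
        {
            // CalculateElementwiseIndex: global thread id times ScalarPerVector
            for(int offset = (block * BlockSize + tid) * ScalarPerVector; offset < tensor_size;
                offset += loop_step)
                for(int v = 0; v < ScalarPerVector; ++v)
                    ++visits[offset + v]; // one functor application per element
        }

    for(int v : visits)
        assert(v == 1); // each element is read, transformed, and written exactly once

    std::printf("grid_size = %d, loop_step = %d\n", grid_size, loop_step);
    return 0;
}
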
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <vector>
#include <array>
#include <algorithm>
#include <thread>
#include "ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp"
namespace ck {
namespace tensor_operation {
namespace host {
template <typename InOutDataType, typename AccDataType>
struct ReferenceBatchNormFwd_Input_N_H_W_C_Output_C : public device::DeviceBatchNormFwd<4, 3>
{
struct Argument : public device::BaseArgument
{
Argument(const std::array<index_t, 4> xyLengths,
const std::array<index_t, 4> xStrides,
const std::array<index_t, 4> yStrides,
const std::array<index_t, 1> bnScaleBiasMeanVarLengths,
const std::array<index_t, 1> bnScaleBiasMeanVarStrides,
const InOutDataType* p_x,
const AccDataType* bnScale,
const AccDataType* bnBias,
InOutDataType* p_y,
double exponentialAverageFactor,
AccDataType* resultRunningMean,
AccDataType* resultRunningVariance,
double epsilon,
AccDataType* resultSaveMean,
AccDataType* resultSaveInvVariance)
: p_x_(p_x),
bnScale_(bnScale),
bnBias_(bnBias),
p_y_(p_y),
resultRunningMean_(resultRunningMean),
resultRunningVariance_(resultRunningVariance),
resultSaveMean_(resultSaveMean),
resultSaveInvVariance_(resultSaveInvVariance),
exponentialAverageFactor_(exponentialAverageFactor),
epsilon_(epsilon)
{
(void)xStrides;
(void)yStrides;
(void)bnScaleBiasMeanVarStrides;
if(xyLengths.size() != 4 || bnScaleBiasMeanVarLengths.size() != 1 ||
bnScaleBiasMeanVarLengths[0] != xyLengths[3])
throw std::runtime_error("Invalid tensor dimensions!");
n = xyLengths[0];
h = xyLengths[1];
w = xyLengths[2];
c = xyLengths[3];
resultSave = (resultSaveMean != nullptr && resultSaveInvVariance != nullptr);
resultRunning = (resultRunningMean != nullptr && resultRunningVariance != nullptr);
}
const InOutDataType* p_x_;
const AccDataType* bnScale_;
const AccDataType* bnBias_;
InOutDataType* p_y_;
AccDataType* resultRunningMean_;
AccDataType* resultRunningVariance_;
AccDataType* resultSaveMean_;
AccDataType* resultSaveInvVariance_;
bool resultSave, resultRunning;
index_t n, h, w, c;
double exponentialAverageFactor_;
double epsilon_;
};
struct Invoker : public device::BaseInvoker
{
float Run(const Argument& arg)
{
auto thread_reduce_func = [&](auto iC) {
AccDataType reduceSize = type_convert<AccDataType>(arg.n) *
type_convert<AccDataType>(arg.h) *
type_convert<AccDataType>(arg.w);
index_t offset_C = iC;
AccDataType mean = type_convert<AccDataType>(0.0f);
AccDataType meansquare = type_convert<AccDataType>(0.0f);
// compute mean, meansquare, variance, invVariance
for(index_t iN = 0; iN < arg.n; iN++)
{
index_t offset_N = iN * arg.h * arg.w * arg.c;
for(index_t iH = 0; iH < arg.h; iH++)
{
index_t offset_H = iH * arg.w * arg.c;
for(index_t iW = 0; iW < arg.w; iW++)
{
index_t offset_W = iW * arg.c;
auto offset = offset_N + offset_H + offset_W + offset_C;
AccDataType x = type_convert<AccDataType>(arg.p_x_[offset]);
mean += x;
meansquare += x * x;
};
}
};
mean = mean / reduceSize;
meansquare = meansquare / reduceSize;
AccDataType variance = meansquare - mean * mean;
AccDataType invVariance =
type_convert<AccDataType>(1.0f) /
std::sqrt(type_convert<AccDataType>(arg.epsilon_) + variance);
// save the mean/invVariance if required
if(arg.resultSave)
{
arg.resultSaveMean_[iC] = mean;
arg.resultSaveInvVariance_[iC] = invVariance;
};
// update the moving average if required
if(arg.resultRunning)
{
arg.resultRunningMean_[iC] =
arg.resultRunningMean_[iC] *
type_convert<AccDataType>(1.0 - arg.exponentialAverageFactor_) +
mean * arg.exponentialAverageFactor_;
arg.resultRunningVariance_[iC] =
arg.resultRunningVariance_[iC] *
type_convert<AccDataType>(1.0 - arg.exponentialAverageFactor_) +
variance * arg.exponentialAverageFactor_;
};
// Normalization
for(index_t iN = 0; iN < arg.n; iN++)
{
index_t offset_N = iN * arg.h * arg.w * arg.c;
for(index_t iH = 0; iH < arg.h; iH++)
{
index_t offset_H = iH * arg.w * arg.c;
for(index_t iW = 0; iW < arg.w; iW++)
{
index_t offset_W = iW * arg.c;
auto offset = offset_N + offset_H + offset_W + offset_C;
AccDataType x = type_convert<AccDataType>(arg.p_x_[offset]);
AccDataType norm_x =
arg.bnScale_[iC] * (x - mean) * invVariance + arg.bnBias_[iC];
arg.p_y_[offset] = type_convert<InOutDataType>(norm_x);
};
}
};
};
std::size_t num_thread = std::thread::hardware_concurrency();
std::size_t work_per_thread = (arg.c + num_thread - 1) / num_thread;
std::vector<joinable_thread> threads(num_thread);
for(std::size_t it = 0; it < num_thread; ++it)
{
std::size_t ic_begin = it * work_per_thread;
std::size_t ic_end = std::min(static_cast<int>((it + 1) * work_per_thread), arg.c);
auto f = [=] {
for(std::size_t ic = ic_begin; ic < ic_end; ++ic)
{
thread_reduce_func(ic);
}
};
threads[it] = joinable_thread(f);
}
return (0.0f);
};
float Run(const device::BaseArgument* p_arg,
const StreamConfig& /*stream_config*/ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
};
};
bool IsSupportedArgument(const device::BaseArgument* p_arg) override
{
(void)p_arg;
return (true);
};
std::unique_ptr<device::BaseArgument>
MakeArgumentPointer(const std::array<index_t, 4> xyLengths,
const std::array<index_t, 4> xStrides,
const std::array<index_t, 4> yStrides,
const std::array<index_t, 1> bnScaleBiasMeanVarLengths,
const std::array<index_t, 1> bnScaleBiasMeanVarStrides,
const void* p_x,
const void* bnScale,
const void* bnBias,
void* p_y,
double exponentialAverageFactor,
void* resultRunningMean,
void* resultRunningVariance,
double epsilon,
void* resultSaveMean,
void* resultSaveInvVariance) override
{
return std::make_unique<Argument>(xyLengths,
xStrides,
yStrides,
bnScaleBiasMeanVarLengths,
bnScaleBiasMeanVarStrides,
static_cast<const InOutDataType*>(p_x),
static_cast<const AccDataType*>(bnScale),
static_cast<const AccDataType*>(bnBias),
static_cast<InOutDataType*>(p_y),
exponentialAverageFactor,
static_cast<AccDataType*>(resultRunningMean),
static_cast<AccDataType*>(resultRunningVariance),
epsilon,
static_cast<AccDataType*>(resultSaveMean),
static_cast<AccDataType*>(resultSaveInvVariance));
};
std::unique_ptr<device::BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>();
};
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "Reference_BatchNorm_Forward_NHWC_C<" << std::endl;
// clang-format on
return str.str();
}
};
} // namespace host
} // namespace tensor_operation
} // namespace ck
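
For each channel, the reference above reduces x and x*x over N*H*W, takes variance = E[x^2] - E[x]^2, computes invVariance = 1/sqrt(epsilon + variance), updates the running statistics with exponentialAverageFactor, and then normalizes as y = bnScale * (x - mean) * invVariance + bnBias. The standalone sketch below (plain C++ on made-up data, illustrative only) condenses that math without the per-channel threading or the CK Argument/Invoker plumbing.

// Illustrative sketch only -- not part of this commit. Condenses the per-channel
// batchnorm-forward math from the reference above (NHWC layout, made-up data),
// omitting the per-channel threading and the CK Argument/Invoker plumbing.
#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
    const int N = 2, H = 2, W = 2, C = 3; // tiny NHWC tensor
    const float epsilon                  = 1e-5f;
    const float exponentialAverageFactor = 0.1f;

    std::vector<float> x(N * H * W * C), y(x.size());
    std::vector<float> bnScale(C, 1.0f), bnBias(C, 0.0f);
    std::vector<float> runningMean(C, 0.0f), runningVariance(C, 1.0f);
    for(std::size_t i = 0; i < x.size(); ++i)
        x[i] = static_cast<float>(i % 7); // arbitrary input values

    for(int c = 0; c < C; ++c)
    {
        const float reduceSize = static_cast<float>(N * H * W);

        // reduce x and x*x over N, H, W for this channel
        float mean = 0.0f, meansquare = 0.0f;
        for(int n = 0; n < N; ++n)
            for(int h = 0; h < H; ++h)
                for(int w = 0; w < W; ++w)
                {
                    const float v = x[((n * H + h) * W + w) * C + c];
                    mean += v;
                    meansquare += v * v;
                }
        mean /= reduceSize;
        meansquare /= reduceSize;

        const float variance    = meansquare - mean * mean; // E[x^2] - E[x]^2
        const float invVariance = 1.0f / std::sqrt(epsilon + variance);

        // moving-average update, as in the resultRunning branch of the reference
        runningMean[c] =
            runningMean[c] * (1.0f - exponentialAverageFactor) + mean * exponentialAverageFactor;
        runningVariance[c] = runningVariance[c] * (1.0f - exponentialAverageFactor) +
                             variance * exponentialAverageFactor;

        // normalization: y = scale * (x - mean) * invVariance + bias
        for(int n = 0; n < N; ++n)
            for(int h = 0; h < H; ++h)
                for(int w = 0; w < W; ++w)
                {
                    const std::size_t off = ((n * H + h) * W + w) * C + c;
                    y[off] = bnScale[c] * (x[off] - mean) * invVariance + bnBias[c];
                }

        std::printf("channel %d: mean = %f, variance = %f\n", c, mean, variance);
    }
    return 0;
}
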
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <vector>
#include <array>
#include <algorithm>
#include "ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp"
namespace ck {
namespace tensor_operation {
namespace host {
template <typename InOutDataType, typename AccDataType>
struct ReferenceBatchNormInfer_Input_N_H_W_C_Output_C : public device::DeviceBatchNormInfer<4, 3>
{
struct Argument : public device::BaseArgument
{
Argument(const std::array<index_t, 4> xyLengths,
const std::array<index_t, 4> xStrides,
const std::array<index_t, 4> yStrides,
const std::array<index_t, 1> bnScaleBiasMeanVarLengths,
const std::array<index_t, 1> bnScaleBiasMeanVarStrides,
const InOutDataType* p_x,
const AccDataType* bnScale,
const AccDataType* bnBias,
double epsilon,
const AccDataType* estimatedMean,
const AccDataType* estimatedVariance,
InOutDataType* p_y)
: p_x_(p_x),
bnScale_(bnScale),
bnBias_(bnBias),
epsilon_(epsilon),
estimatedMean_(estimatedMean),
estimatedVariance_(estimatedVariance),
p_y_(p_y)
{
(void)xStrides;
(void)yStrides;
(void)bnScaleBiasMeanVarStrides;
if(xyLengths.size() != 4 || bnScaleBiasMeanVarLengths.size() != 1 ||
bnScaleBiasMeanVarLengths[0] != xyLengths[3])
throw std::runtime_error("Invalid tensor dimensions!");
n = xyLengths[0];
h = xyLengths[1];
w = xyLengths[2];
c = xyLengths[3];
}
const InOutDataType* p_x_;
const AccDataType* bnScale_;
const AccDataType* bnBias_;
double epsilon_;
const AccDataType* estimatedMean_;
const AccDataType* estimatedVariance_;
InOutDataType* p_y_;
index_t n, h, w, c;
};
struct Invoker : public device::BaseInvoker
{
float Run(const Argument& arg)
{
auto thread_reduce_func = [&](auto iC) {
index_t offset_C = iC;
AccDataType mean = arg.estimatedMean_[offset_C];
AccDataType variance = arg.estimatedVariance_[offset_C];
AccDataType invVariance =
type_convert<AccDataType>(1.0f) /
std::sqrt(type_convert<AccDataType>(arg.epsilon_) + variance);
// Normalization
for(index_t iN = 0; iN < arg.n; iN++)
{
index_t offset_N = iN * arg.h * arg.w * arg.c;
for(index_t iH = 0; iH < arg.h; iH++)
{
index_t offset_H = iH * arg.w * arg.c;
for(index_t iW = 0; iW < arg.w; iW++)
{
index_t offset_W = iW * arg.c;
auto offset = offset_N + offset_H + offset_W + offset_C;
AccDataType x = type_convert<AccDataType>(arg.p_x_[offset]);
AccDataType norm_x =
arg.bnScale_[iC] * (x - mean) * invVariance + arg.bnBias_[iC];
arg.p_y_[offset] = type_convert<InOutDataType>(norm_x);
};
}
};
};
std::size_t num_thread = std::thread::hardware_concurrency();
std::size_t work_per_thread = (arg.c + num_thread - 1) / num_thread;
std::vector<joinable_thread> threads(num_thread);
for(std::size_t it = 0; it < num_thread; ++it)
{
std::size_t ic_begin = it * work_per_thread;
std::size_t ic_end = std::min(static_cast<int>((it + 1) * work_per_thread), arg.c);
auto f = [=] {
for(std::size_t ic = ic_begin; ic < ic_end; ++ic)
{
thread_reduce_func(ic);
}
};
threads[it] = joinable_thread(f);
}
return (0.0f);
};
float Run(const device::BaseArgument* p_arg,
const StreamConfig& /*stream_config*/ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
};
};
bool IsSupportedArgument(const device::BaseArgument* p_arg) override
{
(void)p_arg;
return (true);
};
std::unique_ptr<device::BaseArgument>
MakeArgumentPointer(const std::array<index_t, 4> xyLengths,
const std::array<index_t, 4> xStrides,
const std::array<index_t, 4> yStrides,
const std::array<index_t, 1> bnScaleBiasMeanVarLengths,
const std::array<index_t, 1> bnScaleBiasMeanVarStrides,
const void* p_x,
const void* bnScale,
const void* bnBias,
double epsilon,
const void* estimatedMean,
const void* estimatedVariance,
void* p_y) override
{
return std::make_unique<Argument>(xyLengths,
xStrides,
yStrides,
bnScaleBiasMeanVarLengths,
bnScaleBiasMeanVarStrides,
static_cast<const InOutDataType*>(p_x),
static_cast<const AccDataType*>(bnScale),
static_cast<const AccDataType*>(bnBias),
epsilon,
static_cast<const AccDataType*>(estimatedMean),
static_cast<const AccDataType*>(estimatedVariance),
static_cast<InOutDataType*>(p_y));
};
std::unique_ptr<device::BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>();
};
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "Reference_BatchNorm_Forward_NHWC_C<" << std::endl;
// clang-format on
return str.str();
}
};
} // namespace host
} // namespace tensor_operation
} // namespace ck
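
The inference reference applies the same normalization formula but performs no reduction: the per-channel mean and variance are read from the estimated (running) statistics supplied by the caller. A one-element sketch (plain C++, illustrative only):

// Illustrative sketch only -- not part of this commit. The batchnorm-infer path
// for a single NHWC element and channel: no reduction, the statistics are the
// estimated (running) mean and variance.
#include <cmath>
#include <cstdio>

int main()
{
    const float x = 2.0f, bnScale = 1.5f, bnBias = 0.25f;
    const float estimatedMean = 1.0f, estimatedVariance = 4.0f;
    const float epsilon = 1e-5f;

    const float invVariance = 1.0f / std::sqrt(epsilon + estimatedVariance);
    const float y           = bnScale * (x - estimatedMean) * invVariance + bnBias;

    std::printf("y = %f\n", y); // approximately 1.0 for these values
    return 0;
}
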
@@ -17,9 +17,12 @@ namespace tensor_operation {
namespace device {
namespace instance {
using Normalize = ck::tensor_operation::element_wise::Normalize;
using DeviceNormalizeFromMeanMeanSquarePtr =
ck::tensor_operation::device::DeviceElementwisePtr<5, 1, 2, Normalize>;
using Normalize = ck::tensor_operation::element_wise::Normalize;
using DeviceNormalizeFromMeanMeanSquarePtr = ck::tensor_operation::device::DeviceElementwiseBasePtr<
Tuple<half_t, float, float, half_t, half_t>,
Tuple<half_t>,
Normalize,
2>;
void add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(
std::vector<DeviceNormalizeFromMeanMeanSquarePtr>& instances);
......
@@ -5,6 +5,7 @@
#include <cmath>
#include <numeric>
#include <random>
#include "ck/ck.hpp"
@@ -126,6 +127,23 @@ struct GeneratorTensor_3<ck::bhalf_t>
}
};
template <typename T>
struct GeneratorTensor_4
{
std::default_random_engine generator;
std::normal_distribution<float> distribution;
GeneratorTensor_4(float mean, float stddev) : generator(1), distribution(mean, stddev){};
template <typename... Is>
T operator()(Is...)
{
float tmp = distribution(generator);
return ck::type_convert<T>(tmp);
}
};
struct GeneratorTensor_Checkboard
{
template <typename... Ts>
......
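
GeneratorTensor_4, added in the hunk above, draws every element from a normal distribution with the given mean and standard deviation and ignores the indices passed to operator(); presumably this is how the batchnorm examples obtain normal-distributed input data. A standalone analogue without the CK headers (illustrative only):

// Illustrative sketch only -- not part of this commit. A standalone analogue of
// GeneratorTensor_4: every call returns a fresh sample from N(mean, stddev),
// regardless of which element is being initialized, using a fixed seed.
#include <cstdio>
#include <random>
#include <vector>

int main()
{
    std::default_random_engine generator(1);                  // fixed seed, as in GeneratorTensor_4
    std::normal_distribution<float> distribution(0.0f, 1.0f); // mean, stddev

    std::vector<float> data(16);
    for(auto& v : data)
        v = distribution(generator); // index-independent, like operator()(Is...)

    std::printf("first two samples: %f %f\n", data[0], data[1]);
    return 0;
}
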
@@ -6,7 +6,7 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
@@ -27,19 +27,17 @@ using outputType = F16;
using Normalize = ck::tensor_operation::element_wise::Normalize;
using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances = std::tuple<
// clang-format off
//###################|in | mean| square_mean| gamma| beta| out| ComputeDataType| functor| NDim| MPerThread| in, mean, square_mean, gamma, beta, out ScalarPerVector|
//###################|in | mean| square_mean| gamma| beta| out| ComputeDataType| functor| NDim| MPerThread| in, mean, square_mean, gamma, beta, out ScalarPerVector|
//###################|in | mean| square_mean| gamma| beta| out| ComputeDataType| functor| NDim| MPerThread| in, mean, square_mean, gamma, beta, out ScalarPerVector|
//###################|in | mean| square_mean| gamma| beta| out| ComputeDataType| functor| NDim| MPerThread| in, mean, square_mean, gamma, beta, out ScalarPerVector|
Device5AryElementwise<F16, F32, F32, F16, F16, F16, F32, Normalize, 2, 8, 8, 1, 1, 8, 8, 8 >,
Device5AryElementwise<F16, F32, F32, F16, F16, F16, F32, Normalize, 2, 4, 4, 1, 1, 4, 4, 4 >,
Device5AryElementwise<F16, F32, F32, F16, F16, F16, F32, Normalize, 2, 2, 2, 1, 1, 2, 2, 2 >,
Device5AryElementwise<F16, F32, F32, F16, F16, F16, F32, Normalize, 2, 1, 1, 1, 1, 1, 1, 1 >
//###################|<in, mean, square_mean, gamma, beta>| <out>| functor| NDim| MPerThread| <in, mean, square_mean, gamma, beta ScalarPerVector>| <out ScalarPerVector>|
DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 8, Sequence<8, 1, 1, 8, 8>, Sequence<8> >,
DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 4, Sequence<4, 1, 1, 4, 4>, Sequence<4> >,
DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 2, Sequence<2, 1, 1, 2, 2>, Sequence<2> >,
DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >
// clang-format on
>;
void add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(
std::vector<DeviceElementwisePtr<5, 1, 2, Normalize>>& instances)
std::vector<DeviceElementwiseBasePtr<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2>>&
instances)
{
add_device_operation_instances(
instances, device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances{});
......