Batchnorm splitk single kernel (#771)

* Use dim 0 as faster dim for writing mean/var/count workspace in batchnorm multiblock method [performance] * Add CountDataType as template parameter in blockwise_welford * Add utility/get_shift.hpp * Add BatchNorm multiblock single-kernel implementation * Add smem inline assembly based implementation of gms_init/gms_barrier/gms_reset for gfx90a * Renaming in device_batchnorm_forward_impl.hpp * Tiny fix in the batchnorm_fwd profiler * Revert "Add smem inline assembly based implementation of gms_init/gms_barrier/gms_reset for gfx90a" This reverts commit d16d00919c43f10759e7b4e4d112125221ed9064. * Use the old two-kernel batchnorm multiblock method for gfx1030 * Use the old two-kernel batchnorm multiblock method for gfx908 * use the single-kernel batchnorm multiblock method only for gfx90a * Remove get_wave_id() from utility/get_id.hpp since it is not used * Set true for testing running mean/variance and saving mean/invvariance in the examples * Fix to copy-right words * Remove un-needed including in utility/get_id.hpp * Add comments to workgroup_synchronization.hpp * Remove un-used codes in gridwise_multiblock_batchnorm_forward.hpp * Renaming in the kernels * Remove un-used kernel file

Batchnorm splitk single kernel (#771)
* Use dim 0 as faster dim for writing mean/var/count workspace in batchnorm multiblock method [performance] * Add CountDataType as template parameter in blockwise_welford * Add utility/get_shift.hpp * Add BatchNorm multiblock single-kernel implementation * Add smem inline assembly based implementation of gms_init/gms_barrier/gms_reset for gfx90a * Renaming in device_batchnorm_forward_impl.hpp * Tiny fix in the batchnorm_fwd profiler * Revert "Add smem inline assembly based implementation of gms_init/gms_barrier/gms_reset for gfx90a" This reverts commit d16d00919c43f10759e7b4e4d112125221ed9064. * Use the old two-kernel batchnorm multiblock method for gfx1030 * Use the old two-kernel batchnorm multiblock method for gfx908 * use the single-kernel batchnorm multiblock method only for gfx90a * Remove get_wave_id() from utility/get_id.hpp since it is not used * Set true for testing running mean/variance and saving mean/invvariance in the examples * Fix to copy-right words * Remove un-needed including in utility/get_id.hpp * Add comments to workgroup_synchronization.hpp * Remove un-used codes in gridwise_multiblock_batchnorm_forward.hpp * Renaming in the kernels * Remove un-used kernel file
8f5cafaf · Qianfeng · GitHub · f4dfc060 · 8f5cafaf · 8f5cafaf
Unverified Commit 8f5cafaf authored Jul 06, 2023 by Qianfeng Committed by GitHub Jul 06, 2023
14 changed files
--- a/example/34_batchnorm/CMakeLists.txt
+++ b/example/34_batchnorm/CMakeLists.txt
 add_example_executable(example_batchnorm_forward_training batchnorm_forward_training_nhwc.cpp)
+add_example_executable(example_batchnorm_forward_training_obsolete batchnorm_forward_training_nhwc_obsolete.cpp)
 add_example_executable(example_batchnorm_forward_inferring batchnorm_forward_inferring_nhwc.cpp)
 add_example_executable(example_batchnorm_backward batchnorm_backward_nhwc.cpp)
--- a/example/34_batchnorm/batchnorm_forward_training_nhwc.cpp
+++ b/example/34_batchnorm/batchnorm_forward_training_nhwc.cpp
@@ -414,7 +414,7 @@ bool bnorm_fwd_nhwc_test(bool do_verification,
        (void)invoker_ptr_ref->Run(argument_ptr_ref.get());

        y_dev.FromDevice(y.mData.data());
-        pass = pass && ck::utils::check_err(y, y_ref);
+        pass = pass && ck::utils::check_err(y, y_ref, "Incorrect normalized output values");

        if(updateMovingAverage)
        {
@@ -424,8 +424,12 @@ bool bnorm_fwd_nhwc_test(bool do_verification,
            resultRunningMean_dev.FromDevice(resultRunningMean.mData.data());
            resultRunningVariance_dev.FromDevice(resultRunningVariance.mData.data());

-            pass = pass && ck::utils::check_err(resultRunningMean, resultRunningMean_ref);
-            pass = pass && ck::utils::check_err(resultRunningVariance, resultRunningVariance_ref);
+            pass = pass && ck::utils::check_err(resultRunningMean,
+                                                resultRunningMean_ref,
+                                                "Incorrect running mean values");
+            pass = pass && ck::utils::check_err(resultRunningVariance,
+                                                resultRunningVariance_ref,
+                                                "Incorrect running variance values");
        };

        if(saveMeanAndInvVariance)
@@ -438,8 +442,11 @@ bool bnorm_fwd_nhwc_test(bool do_verification,
            resultSaveMean_dev.FromDevice(resultSaveMean.mData.data());
            resultSaveInvVariance_dev.FromDevice(resultSaveInvVariance.mData.data());

-            pass = pass && ck::utils::check_err(resultSaveMean, resultSaveMean_ref);
-            pass = pass && ck::utils::check_err(resultSaveInvVariance, resultSaveInvVariance_ref);
+            pass = pass && ck::utils::check_err(
+                               resultSaveMean, resultSaveMean_ref, "Incorrect saved mean values");
+            pass = pass && ck::utils::check_err(resultSaveInvVariance,
+                                                resultSaveInvVariance_ref,
+                                                "Incorrect saved invvariance values");
        };
    };


--- a/example/34_batchnorm/batchnorm_forward_training_nhwc_obsolete.cpp
+++ b/example/34_batchnorm/batchnorm_forward_training_nhwc_obsolete.cpp
--- a/include/ck/tensor_operation/gpu/block/blockwise_welford.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_welford.hpp
@@ -4,7 +4,7 @@
 #pragma once

 #include "ck/tensor_description/cluster_descriptor.hpp"
-#include "ck/utility/reduction_common.hpp"
+#include "ck/utility/get_shift.hpp"

 namespace ck {

@@ -35,10 +35,11 @@ struct BlockwiseWelford
    static constexpr auto thread_cluster_desc =
        make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});

+    template <typename CountDataType>
    __device__ static inline void
-    Merge(T& mean_a, T& var_a, int& count_a, T mean_b, T var_b, int count_b)
+    Merge(T& mean_a, T& var_a, CountDataType& count_a, T mean_b, T var_b, CountDataType count_b)
    {
-        int count            = count_a + count_b;
+        CountDataType count  = count_a + count_b;
        T count_b_over_count = count == 0 ? type_convert<T>(0) : type_convert<T>(count_b) / count;
        T delta              = mean_b - mean_a;
        mean_a += delta * count_b_over_count;
@@ -46,11 +47,12 @@ struct BlockwiseWelford
        count_a = count;
    }

-    __device__ static void Run(T& mean_value, T& var_value, int& count)
+    template <typename CountDataType>
+    __device__ static void Run(T& mean_value, T& var_value, CountDataType& count)
    {
        __shared__ T mean_block_buf[BlockSize];
        __shared__ T var_block_buf[BlockSize];
-        __shared__ int count_block_buf[BlockSize];
+        __shared__ CountDataType count_block_buf[BlockSize];

        constexpr auto cluster_len_shift = get_shift<BufferLength_K>();

@@ -76,13 +78,13 @@ struct BlockwiseWelford
                index_t offset2 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx +
                                                                     make_tuple(0, indOffset));

-                T mean1    = mean_block_buf[offset1];
-                T var1     = var_block_buf[offset1];
-                int count1 = count_block_buf[offset1];
+                T mean1              = mean_block_buf[offset1];
+                T var1               = var_block_buf[offset1];
+                CountDataType count1 = count_block_buf[offset1];

-                T mean2    = mean_block_buf[offset2];
-                T var2     = var_block_buf[offset2];
-                int count2 = count_block_buf[offset2];
+                T mean2              = mean_block_buf[offset2];
+                T var2               = var_block_buf[offset2];
+                CountDataType count2 = count_block_buf[offset2];

                Merge(mean1, var1, count1, mean2, var2, count2);


--- a/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp
+++ b/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp
@@ -4,7 +4,7 @@
 #pragma once

 #include "ck/tensor_description/cluster_descriptor.hpp"
-#include "ck/utility/reduction_common.hpp"
+#include "ck/utility/get_shift.hpp"
 #include "ck/utility/reduction_functions_accumulate.hpp"

 namespace ck {

--- a/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp
@@ -10,12 +10,14 @@
 #include "ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp"
 #include "ck/tensor_operation/gpu/device/welford_helper.hpp"
+#include "ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_batchnorm_forward.hpp"
 #include "ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp"
-#include "ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final.hpp"
+#include "ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final_obsolete.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/hip_check_error.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -114,8 +116,8 @@ struct DeviceBatchNormFwdImpl : public DeviceBatchNormFwd<XDataType,

    static auto MakeMeanVarCountOutputMG2dDescriptor(int invariantLength, int blkGroupSize)
    {
-        const auto grid_desc_m_g =
-            make_naive_tensor_descriptor_packed(make_tuple(invariantLength, blkGroupSize));
+        const auto grid_desc_m_g = make_naive_tensor_descriptor(
+            make_tuple(invariantLength, blkGroupSize), make_tuple(1, invariantLength));

        const auto mPad =
            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
@@ -132,9 +134,9 @@ struct DeviceBatchNormFwdImpl : public DeviceBatchNormFwd<XDataType,

    static auto MakeMeanVarCountInputMK2dDescriptor(int invariantLength, int blkGroupSize)
    {
-        const auto reduceLength = blkGroupSize;
-        const auto grid_desc_m_k =
-            make_naive_tensor_descriptor_packed(make_tuple(invariantLength, reduceLength));
+        const auto reduceLength  = blkGroupSize;
+        const auto grid_desc_m_k = make_naive_tensor_descriptor(
+            make_tuple(invariantLength, reduceLength), make_tuple(1, invariantLength));

        const auto mPad =
            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
@@ -244,8 +246,8 @@ struct DeviceBatchNormFwdImpl : public DeviceBatchNormFwd<XDataType,
                    int testBlkGroupSize = (reduce_length_ + (K_BlockTileSize * iterations) - 1) /
                                           (K_BlockTileSize * iterations);

-                    // we want the blkGroupSize be not more than 128
-                    if(testBlkGroupSize <= 128)
+                    // we want blkGroupSize to be no more than 16
+                    if(testBlkGroupSize <= 16)
                        break;

                    iterations++;
@@ -319,6 +321,8 @@ struct DeviceBatchNormFwdImpl : public DeviceBatchNormFwd<XDataType,
        void* workspace_mean_;
        void* workspace_variance_;
        void* workspace_count_;
+
+        void* control_;
    };

    size_t GetWorkSpaceSize(const BaseArgument* pArg) const override
@@ -340,6 +344,11 @@ struct DeviceBatchNormFwdImpl : public DeviceBatchNormFwd<XDataType,
            // workspace for welford intermediate count
            workspace_size +=
                pArg_->invariant_length_ * pArg_->blkGroupSize_ * sizeof(int32_t) + 64;
+
+            // workspace for barrier objects, each barrier object consists of two integers
+            // TODO: allocate barrier object memory globally to reuse it by other operators
+            workspace_size += (pArg_->invariant_length_ + M_BlockTileSize - 1) / M_BlockTileSize *
+                              sizeof(int) * 2;
        }

        return (workspace_size);
@@ -353,7 +362,6 @@ struct DeviceBatchNormFwdImpl : public DeviceBatchNormFwd<XDataType,

        if(UseMultiblockInK && pArg_->blkGroupSize_ > 1)
        {
-
            // setup buffer used for intermediate welford mean
            pArg_->workspace_mean_ = static_cast<char*>(pArg_->p_workspace_);

@@ -374,6 +382,18 @@ struct DeviceBatchNormFwdImpl : public DeviceBatchNormFwd<XDataType,
            // setup buffer used for intermediate welfor count
            pArg_->workspace_count_ =
                reinterpret_cast<char*>(pArg_->workspace_variance_) + variance_space_sz;
+
+            index_t count_space_sz =
+                pArg_->invariant_length_ * pArg_->blkGroupSize_ * sizeof(int32_t);
+
+            count_space_sz = math::integer_least_multiple(count_space_sz, 64);
+
+            pArg_->control_ = reinterpret_cast<char*>(pArg_->workspace_count_) + count_space_sz;
+
+            index_t control_space_sz = (pArg_->invariant_length_ + M_BlockTileSize - 1) /
+                                       M_BlockTileSize * sizeof(int) * 2;
+
+            hip_check_error(hipMemset(pArg_->control_, 0, control_space_sz));
        };
    };

@@ -402,6 +422,32 @@ struct DeviceBatchNormFwdImpl : public DeviceBatchNormFwd<XDataType,
                using MeanVarCountGridDesc_M_G = decltype(mean_var_count_grid_desc_m_g);
                using MeanVarCountGridDesc_M_K = decltype(mean_var_count_grid_desc_m_k);

+                using GridwiseMultiblockBatchNormForward_ =
+                    GridwiseMultiblockBatchNormForward<XDataType,
+                                                       YDataType,
+                                                       AccDataType,
+                                                       ScaleDataType,
+                                                       BiasDataType,
+                                                       MeanVarDataType,
+                                                       YElementwiseOp,
+                                                       XYGridDesc_M_K,
+                                                       MeanVarCountGridDesc_M_G,
+                                                       MeanVarCountGridDesc_M_K,
+                                                       ScaleBiasMeanVarGridDesc_M,
+                                                       ScaleBiasMeanVarGridDesc_M,
+                                                       GetReduceCountPerThreadFunctor,
+                                                       BlockSize,
+                                                       MThreadClusterSize,
+                                                       KThreadClusterSize,
+                                                       MThreadSliceSize,
+                                                       KThreadSliceSize,
+                                                       XSrcYDstVectorDim,
+                                                       XSrcVectorSize,
+                                                       YDstVectorSize,
+                                                       ScaleSrcVectorSize,
+                                                       BiasSrcVectorSize,
+                                                       MeanVarSrcDstVectorSize>;
+
                using GridwiseMultiblockWelfordFirstHalf_ =
                    GridwiseMultiblockWelfordFirstHalf<XDataType,
                                                       AccDataType,
@@ -441,78 +487,136 @@ struct DeviceBatchNormFwdImpl : public DeviceBatchNormFwd<XDataType,
                                                                   BiasSrcVectorSize,
                                                                   MeanVarSrcDstVectorSize>;

-                index_t numMeanVarCountBlockTileIteration =
-                    (arg.blkGroupSize_ + KThreadClusterSize - 1) / KThreadClusterSize;
-
-                const auto kern_multiblock_welford_first_half =
-                    kernel_multiblock_welford_first_half<GridwiseMultiblockWelfordFirstHalf_,
-                                                         XDataType,
-                                                         MeanVarDataType,
-                                                         XYGridDesc_M_K,
-                                                         MeanVarCountGridDesc_M_G,
-                                                         GetReduceCountPerThreadFunctor>;
-
-                const auto kern_welford_second_half_batchnorm_forward_final =
-                    kernel_welford_second_half_batchnorm_forward_final<
-                        GridwiseWelfordSecondHalfBatchNormForwardFinal_,
-                        XDataType,
-                        YDataType,
-                        AccDataType,
-                        ScaleDataType,
-                        BiasDataType,
-                        MeanVarDataType,
-                        YElementwiseOp,
-                        XYGridDesc_M_K,
-                        MeanVarCountGridDesc_M_K,
-                        ScaleBiasMeanVarGridDesc_M,
-                        ScaleBiasMeanVarGridDesc_M>;
-
-                avg_time +=
-                    launch_and_time_kernel(stream_config,
-                                           kern_multiblock_welford_first_half,
-                                           dim3(arg.gridSize_),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.x_grid_desc_m_k_,
-                                           mean_var_count_grid_desc_m_g,
-                                           get_reduce_count_per_thread,
-                                           arg.numBlockTileIteration_,
-                                           arg.p_x_,
-                                           static_cast<MeanVarDataType*>(arg.workspace_mean_),
-                                           static_cast<MeanVarDataType*>(arg.workspace_variance_),
-                                           static_cast<int32_t*>(arg.workspace_count_));
-
-                avg_time +=
-                    launch_and_time_kernel(stream_config,
-                                           kern_welford_second_half_batchnorm_forward_final,
-                                           dim3(arg.gridSize_),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.x_grid_desc_m_k_,
-                                           arg.y_grid_desc_m_k_,
-                                           mean_var_count_grid_desc_m_k,
-                                           arg.scale_grid_desc_m_,
-                                           arg.bias_grid_desc_m_,
-                                           arg.mean_var_grid_desc_m_,
-                                           arg.blkGroupSize_,
-                                           arg.numBlockTileIteration_,
-                                           numMeanVarCountBlockTileIteration,
-                                           arg.epsilon_,
-                                           static_cast<MeanVarDataType*>(arg.workspace_mean_),
-                                           static_cast<MeanVarDataType*>(arg.workspace_variance_),
-                                           static_cast<int32_t*>(arg.workspace_count_),
-                                           arg.p_x_,
-                                           arg.p_scale_,
-                                           arg.p_bias_,
-                                           arg.y_elementwise_op_,
-                                           arg.p_y_,
-                                           arg.updateMovingAverage_,
-                                           arg.averageFactor_,
-                                           arg.resultRunningMean_,
-                                           arg.resultRunningVariance_,
-                                           arg.saveMeanInvVariance_,
-                                           arg.resultSaveMean_,
-                                           arg.resultSaveInvVariance_);
+                // It is found that:
+                // 1) gfx1030 does not support the GLC enabled vector load/store, so using the
+                //    two-kernel method for gfx1030
+                // 2) Profiler on gfx908 could hang even though it works when running examples
+                // 3) Single-kernel method works on gfx1100, but the performance is not better
+                //    than two-kernel method (due to more warps participating in the barrier)
+                if(ck::get_device_name() == "gfx90a")
+                {
+                    const auto kern_multiblock_batchnorm_fwd_ =
+                        kernel_multiblock_batchnorm_forward<GridwiseMultiblockBatchNormForward_,
+                                                            XDataType,
+                                                            YDataType,
+                                                            AccDataType,
+                                                            ScaleDataType,
+                                                            BiasDataType,
+                                                            MeanVarDataType,
+                                                            YElementwiseOp,
+                                                            XYGridDesc_M_K,
+                                                            MeanVarCountGridDesc_M_G,
+                                                            MeanVarCountGridDesc_M_K,
+                                                            ScaleBiasMeanVarGridDesc_M,
+                                                            ScaleBiasMeanVarGridDesc_M,
+                                                            GetReduceCountPerThreadFunctor>;
+
+                    avg_time += launch_and_time_kernel(
+                        stream_config,
+                        kern_multiblock_batchnorm_fwd_,
+                        dim3(arg.gridSize_),
+                        dim3(BlockSize),
+                        0,
+                        arg.x_grid_desc_m_k_,
+                        arg.y_grid_desc_m_k_,
+                        mean_var_count_grid_desc_m_g, // for writing to mean/variance/count
+                                                      // workspace by multiple workgroups
+                        mean_var_count_grid_desc_m_k, // for reading from mean/variance/count
+                                                      // workspace by each workgroup
+                        arg.scale_grid_desc_m_,
+                        arg.bias_grid_desc_m_,
+                        arg.mean_var_grid_desc_m_,
+                        get_reduce_count_per_thread,
+                        arg.numBlockTileIteration_,
+                        arg.epsilon_,
+                        arg.p_x_,
+                        static_cast<MeanVarDataType*>(arg.workspace_mean_),
+                        static_cast<MeanVarDataType*>(arg.workspace_variance_),
+                        static_cast<int32_t*>(arg.workspace_count_),
+                        static_cast<int*>(arg.control_),
+                        arg.p_scale_,
+                        arg.p_bias_,
+                        arg.y_elementwise_op_,
+                        arg.p_y_,
+                        arg.updateMovingAverage_, // true or false
+                        arg.averageFactor_,
+                        arg.resultRunningMean_,
+                        arg.resultRunningVariance_,
+                        arg.saveMeanInvVariance_, // true or false
+                        arg.resultSaveMean_,
+                        arg.resultSaveInvVariance_);
+                }
+                else
+                {
+                    const auto kern_multiblock_welford_first_half =
+                        kernel_multiblock_welford_first_half<GridwiseMultiblockWelfordFirstHalf_,
+                                                             XDataType,
+                                                             MeanVarDataType,
+                                                             XYGridDesc_M_K,
+                                                             MeanVarCountGridDesc_M_G,
+                                                             GetReduceCountPerThreadFunctor>;
+
+                    const auto kern_welford_second_half_batchnorm_forward_final =
+                        kernel_welford_second_half_batchnorm_forward_final<
+                            GridwiseWelfordSecondHalfBatchNormForwardFinal_,
+                            XDataType,
+                            YDataType,
+                            AccDataType,
+                            ScaleDataType,
+                            BiasDataType,
+                            MeanVarDataType,
+                            YElementwiseOp,
+                            XYGridDesc_M_K,
+                            MeanVarCountGridDesc_M_K,
+                            ScaleBiasMeanVarGridDesc_M,
+                            ScaleBiasMeanVarGridDesc_M>;
+
+                    avg_time += launch_and_time_kernel(
+                        stream_config,
+                        kern_multiblock_welford_first_half,
+                        dim3(arg.gridSize_),
+                        dim3(BlockSize),
+                        0,
+                        arg.x_grid_desc_m_k_,
+                        mean_var_count_grid_desc_m_g,
+                        get_reduce_count_per_thread,
+                        arg.numBlockTileIteration_,
+                        arg.p_x_,
+                        static_cast<MeanVarDataType*>(arg.workspace_mean_),
+                        static_cast<MeanVarDataType*>(arg.workspace_variance_),
+                        static_cast<int32_t*>(arg.workspace_count_));
+
+                    avg_time += launch_and_time_kernel(
+                        stream_config,
+                        kern_welford_second_half_batchnorm_forward_final,
+                        dim3(arg.gridSize_),
+                        dim3(BlockSize),
+                        0,
+                        arg.x_grid_desc_m_k_,
+                        arg.y_grid_desc_m_k_,
+                        mean_var_count_grid_desc_m_k,
+                        arg.scale_grid_desc_m_,
+                        arg.bias_grid_desc_m_,
+                        arg.mean_var_grid_desc_m_,
+                        arg.blkGroupSize_,
+                        arg.numBlockTileIteration_,
+                        arg.epsilon_,
+                        static_cast<MeanVarDataType*>(arg.workspace_mean_),
+                        static_cast<MeanVarDataType*>(arg.workspace_variance_),
+                        static_cast<int32_t*>(arg.workspace_count_),
+                        arg.p_x_,
+                        arg.p_scale_,
+                        arg.p_bias_,
+                        arg.y_elementwise_op_,
+                        arg.p_y_,
+                        arg.updateMovingAverage_,
+                        arg.averageFactor_,
+                        arg.resultRunningMean_,
+                        arg.resultRunningVariance_,
+                        arg.saveMeanInvVariance_,
+                        arg.resultSaveMean_,
+                        arg.resultSaveInvVariance_);
+                };
            }
            else
            {

--- a/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl_obsolete.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl_obsolete.hpp
--- a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_batchnorm_forward.hpp
+++ b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_batchnorm_forward.hpp
--- a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp
+++ b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp
@@ -161,7 +161,7 @@ struct GridwiseMultiblockWelfordFirstHalf
                                               PassThroughOp,
                                               ThreadBufferLengths_M_1,
                                               Sequence<0, 1>,
-                                               1,
+                                               0,
                                               1,
                                               InMemoryDataOperationEnum::Set,
                                               1,
@@ -180,7 +180,7 @@ struct GridwiseMultiblockWelfordFirstHalf
                                               PassThroughOp,
                                               ThreadBufferLengths_M_1,
                                               Sequence<0, 1>,
-                                               1,
+                                               0,
                                               1,
                                               InMemoryDataOperationEnum::Set,
                                               1,

--- a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final.hpp
+++ b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final.hpp
@@ -33,7 +33,6 @@ __global__ void kernel_welford_second_half_batchnorm_forward_final(
    const MeanVarGridDesc_M mean_var_grid_desc_m,
    index_t blkgroup_size,
    index_t num_xy_k_block_tile_iteration,
-    index_t num_mean_var_count_k_block_tile_iteration,
    AccDataType epsilon,
    const MeanVarDataType* const __restrict__ p_in_welford_mean,
    const MeanVarDataType* const __restrict__ p_in_welford_variance,
@@ -59,7 +58,6 @@ __global__ void kernel_welford_second_half_batchnorm_forward_final(
                                                         mean_var_grid_desc_m,
                                                         blkgroup_size,
                                                         num_xy_k_block_tile_iteration,
-                                                         num_mean_var_count_k_block_tile_iteration,
                                                         epsilon,
                                                         p_in_welford_mean,
                                                         p_in_welford_variance,
@@ -152,7 +150,6 @@ struct GridwiseWelfordSecondHalfBatchNormForwardFinal
                               const MeanVarGridDesc_M& mean_var_grid_desc_m,
                               index_t blkgroup_size,
                               index_t num_xy_k_block_tile_iteration,
-                               index_t num_mean_var_count_k_block_tile_iteration,
                               AccDataType epsilon,
                               const MeanVarDataType* const __restrict__ p_in_welford_mean,
                               const MeanVarDataType* const __restrict__ p_in_welford_variance,
@@ -223,7 +220,7 @@ struct GridwiseWelfordSecondHalfBatchNormForwardFinal
                                             decltype(thread_buffer_desc_m_1),
                                             ThreadBufferLengths_M_1,
                                             Sequence<0, 1>,
-                                             1,
+                                             0,
                                             1,
                                             1,
                                             true>(
@@ -239,7 +236,7 @@ struct GridwiseWelfordSecondHalfBatchNormForwardFinal
                                             decltype(thread_buffer_desc_m_1),
                                             ThreadBufferLengths_M_1,
                                             Sequence<0, 1>,
-                                             1,
+                                             0,
                                             1,
                                             1,
                                             true>(
@@ -257,9 +254,6 @@ struct GridwiseWelfordSecondHalfBatchNormForwardFinal
        const auto welford_count_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_in_welford_count, mean_var_count_grid_desc_m_k.GetElementSpaceSize());

-        constexpr auto mean_var_count_thread_copy_step_m_k =
-            make_multi_index(0, KThreadClusterSize * 1);
-
        // Step 1: do final welford reduction to get mean and variance

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
@@ -268,8 +262,11 @@ struct GridwiseWelfordSecondHalfBatchNormForwardFinal
            welford_count_thread_buf(I) = 0;
        });

-        for(index_t reducedTiles = 0; reducedTiles < num_mean_var_count_k_block_tile_iteration;
-            ++reducedTiles)
+        constexpr auto mean_var_count_thread_copy_step_m_k =
+            make_multi_index(0, KThreadClusterSize);
+
+        int32_t reducedSize = 0;
+        while(reducedSize < blkgroup_size)
        {
            threadwise_mean_var_load_m_k.Run(mean_var_count_grid_desc_m_k,
                                             welford_mean_global_val_buf,
@@ -296,6 +293,8 @@ struct GridwiseWelfordSecondHalfBatchNormForwardFinal
                                   welford_var_thread_buf,
                                   welford_count_thread_buf);

+            reducedSize += KThreadClusterSize;
+
            threadwise_mean_var_load_m_k.MoveSrcSliceWindow(mean_var_count_grid_desc_m_k,
                                                            mean_var_count_thread_copy_step_m_k);
            threadwise_count_load_m_k.MoveSrcSliceWindow(mean_var_count_grid_desc_m_k,

--- a/include/ck/utility/get_shift.hpp
+++ b/include/ck/utility/get_shift.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+namespace ck {
+
+// Compile-time shift amount for N: get_shift<N>() counts how many times N can be halved
+// (integer division) before reaching 1, i.e. floor(log2(N)). For a power-of-two N this is
+// the exact k with (1 << k) == N. Evaluated by recursing on N / 2 down to get_shift<1>().
+template <index_t N>
+static constexpr __device__ index_t get_shift()
+{
+    // N <= 0 never reaches the get_shift<1>() terminator (N / 2 settles at 0), so reject it
+    // at compile time instead of recursing without end
+    static_assert(N > 0, "get_shift<N>() requires N > 0");
+
+    return (get_shift<N / 2>() + 1);
+}
+
+// Recursion terminator: a value of 1 needs no shift
+template <>
+constexpr __device__ index_t get_shift<1>()
+{
+    return (0);
+}
+
+} // namespace ck
--- a/include/ck/utility/reduction_common.hpp
+++ b/include/ck/utility/reduction_common.hpp
@@ -25,16 +25,4 @@ struct float_equal_zero
    };
 };

-template <index_t N>
-static constexpr __device__ index_t get_shift()
-{
-    return (get_shift<N / 2>() + 1);
-};
-
-template <>
-constexpr __device__ index_t get_shift<1>()
-{
-    return (0);
-}
-
 } // namespace ck
--- a/include/ck/utility/workgroup_synchronization.hpp
+++ b/include/ck/utility/workgroup_synchronization.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+#include "ck/host_utility/hip_check_error.hpp"
+
+namespace ck {
+
+// Value held in p_control_bits[0] while the barrier is initialized; any non-zero value works
+static constexpr int BarrierInitFlag = 0x7856;
+
+// Arms the barrier: publishes {BarrierInitFlag, NumWarps} into p_control_bits[0..1] with a
+// single 64-bit compare-and-swap, executed by thread 0 of the calling workgroup.
+//   NumWarps       - value stored in p_control_bits[1]; each warp that calls gms_barrier()
+//                    decrements it once, so it must equal the number of participating warps
+//   p_control_bits - two consecutive ints, accessed here as one 64-bit word
+//                    (NOTE(review): presumably must be 8-byte aligned -- confirm at allocation)
+// 1) Only the first thread-block in the synchronization group is supposed to call this function.
+// It is the responsibility of the user to ensure the two integer values in p_control_bits are
+// zeros before calling gms_init(); the swap only stores when the 64-bit word compares equal to 0.
+// 2) After gms_barrier() has completed and gms_reset() has been called, the two integer values
+// in p_control_bits will be zeros, so no repetitious initialization of the buffer is required
+static __device__ void gms_init(int NumWarps, int* p_control_bits)
+{
+    // unsigned long long is at least 64 bits on every data model (unsigned long is only 32 bits
+    // on LLP64 targets), so the swap below always covers both control integers
+    using packed_t = unsigned long long;
+
+    static_assert(sizeof(packed_t) == 2 * sizeof(int),
+                  "the two control integers must exactly fill one 64-bit word");
+
+    union
+    {
+        int two32[2];
+        packed_t one64;
+    } regs;
+
+    regs.two32[0] = BarrierInitFlag; // lands in p_control_bits[0]
+    regs.two32[1] = NumWarps;        // lands in p_control_bits[1]
+
+    if(threadIdx.x == 0)
+        (void)atomicCAS(reinterpret_cast<packed_t*>(p_control_bits), 0ull, regs.one64);
+}
+
+// Barrier across the workgroups of a synchronization group; all of them are supposed to call
+// this function. Only threads whose threadIdx.x is a multiple of warpSize take part:
+//   1) spin until p_control_bits[0] holds BarrierInitFlag, i.e. gms_init() has taken effect
+//   2) decrement the arrival counter p_control_bits[1] by one
+//   3) spin until the counter reads zero, i.e. all warps have arrived
+// NOTE(review): the polling loads are relaxed and no fence is issued here -- confirm callers
+// provide whatever memory ordering they need for data exchanged across the barrier
+static __device__ void gms_barrier(int* p_control_bits)
+{
+    constexpr int lane_mask = warpSize - 1;
+
+    const bool is_first_lane = ((threadIdx.x & lane_mask) == 0);
+
+    if(is_first_lane)
+    {
+        int* const p_init_flag   = &p_control_bits[0];
+        int* const p_num_pending = &p_control_bits[1];
+
+        // ensure the barrier object is initialized
+        while(__atomic_load_n(p_init_flag, __ATOMIC_RELAXED) != BarrierInitFlag) {}
+
+        // go ahead toward the barrier line
+        atomicSub(p_num_pending, 1);
+
+        // wait until all warps have arrived
+        while(__atomic_load_n(p_num_pending, __ATOMIC_RELAXED) != 0) {}
+    }
+}
+
+// Disarms the barrier by swapping p_control_bits[0] from BarrierInitFlag back to zero (done by
+// thread 0 of the calling workgroup only).
+// 1) Only the first thread-block in the synchronization group is supposed to call this function.
+// 2) p_control_bits[1] is not written here; it already reads zero once every warp has passed
+// gms_barrier(), so after gms_reset() both integers are zeros and no repetitious
+// initialization of the p_control_bits buffer is required
+static __device__ void gms_reset(int* p_control_bits)
+{
+    if(threadIdx.x != 0)
+        return;
+
+    // reset the barrier object
+    (void)atomicCAS(&p_control_bits[0], BarrierInitFlag, 0);
+}
+
+} // namespace ck
--- a/profiler/src/profile_batchnorm_fwd.cpp
+++ b/profiler/src/profile_batchnorm_fwd.cpp
@@ -148,7 +148,7 @@ int profile_batchnorm_forward(int argc, char* argv[])
    {
        if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3)
        {
-            profile_batchnorm_forward_impl<F16, F16, F32, F16, F16, F16, 4, 3>(
+            profile_batchnorm_forward_impl<F16, F16, F32, F16, F16, F32, 4, 3>(
                arg_parser.do_verification,
                arg_parser.init_method,
                arg_parser.do_dumpout,