Update naive variance kernel

8745c0ca · rocking · 3b6f9c16 · 8745c0ca
Commit 8745c0ca authored Feb 10, 2023 by rocking
Hide whitespace changes
Inline Side-by-side

Showing with 291 additions and 155 deletions

include/ck/tensor_operation/gpu/grid/gridwise_normalization_naive_variance.hpp ...ration/gpu/grid/gridwise_normalization_naive_variance.hpp +291 -155

No files found.
--- a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_naive_variance.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_naive_variance.hpp
@@ -4,9 +4,8 @@
 #pragma once

 #include "ck/utility/data_type.hpp"
-#include "ck/utility/reduction_common.hpp"
+
 #include "ck/utility/reduction_operator.hpp"
-#include "ck/utility/reduction_functions_accumulate.hpp"
 #include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp"
 #include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp"
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
@@ -19,8 +18,8 @@ template <typename XDataType,
          typename GammaDataType,
          typename BetaDataType,
          typename YDataType,
-          typename AccDataType,
-          typename AccElementwiseOperation,
+          typename ComputeDataType,
+          typename YElementwiseOperation,
          typename GridDesc_M_K,
          index_t BlockSize,
          index_t MThreadClusterSize,
@@ -46,6 +45,10 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
                      (YDstVectorDim == 1 && KThreadSliceSize % YDstVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

+    static_assert(XSrcVectorSize == YDstVectorSize);
+    static_assert(XSrcVectorSize == GammaSrcVectorSize);
+    static_assert(XSrcVectorSize == BetaSrcVectorSize);
+
    static constexpr bool reorder_thread_cluster = (XSrcVectorDim == 0);

    using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;
@@ -59,19 +62,23 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
    static constexpr auto thread_cluster_desc =
        make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});

+    using ThreadBufferLengths_M_K                = Sequence<MThreadSliceSize, XSrcVectorSize>;
+    static constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MThreadSliceSize>{}, Number<XSrcVectorSize>{}));
+
    using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(
-        make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{})));
+        make_tuple(Number<MThreadSliceSize>{}, Number<XSrcVectorSize>{})));
    using ThreadReduceDstDesc_M =
        decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));

-    using BlockwiseSumReduce = PartitionedBlockwiseReduction<AccDataType,
+    using BlockwiseSumReduce = PartitionedBlockwiseReduction<ComputeDataType,
                                                             BlockSize,
                                                             ThreadClusterLengths_M_K,
                                                             ThreadClusterArrangeOrder,
                                                             reduce::Add,
                                                             true>;

-    using ThreadwiseSumReduce = ThreadwiseReduction<AccDataType,
+    using ThreadwiseSumReduce = ThreadwiseReduction<ComputeDataType,
                                                    ThreadReduceSrcDesc_M_K,
                                                    ThreadReduceDstDesc_M,
                                                    reduce::Add,
@@ -81,64 +88,77 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
    static constexpr auto I1 = Number<1>{};
    static constexpr auto I2 = Number<2>{};

-    static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
-    static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
+    static constexpr index_t M_BlockTileSize     = MThreadClusterSize * MThreadSliceSize;
+    static constexpr index_t K_BlockTileSize     = KThreadClusterSize * KThreadSliceSize;
+    static constexpr index_t K_BlockTileStepSize = KThreadClusterSize * XSrcVectorSize;
+
+    static constexpr auto ThreadBufferNumber = Number<KThreadSliceSize / XSrcVectorSize>{};

    __device__ static void Run(const GridDesc_M_K& x_grid_desc_m_k,
                               const GridDesc_M_K& gamma_grid_desc_m_k,
                               const GridDesc_M_K& beta_grid_desc_m_k,
                               const GridDesc_M_K& y_grid_desc_m_k,
                               index_t num_k_block_tile_iteration,
-                               AccDataType epsilon,
+                               ComputeDataType epsilon,
                               const XDataType* const __restrict__ p_x_global,
                               const GammaDataType* const __restrict__ p_gamma_global,
                               const BetaDataType* const __restrict__ p_beta_global,
                               YDataType* const __restrict__ p_y_global,
-                               const AccElementwiseOperation acc_elementwise_op)
+                               const YElementwiseOperation y_elementwise_op)
    {
-        if constexpr(SweepOnce)
-        {
-            num_k_block_tile_iteration = 1;
-        }
-
        // LDS
-        __shared__ AccDataType p_reduce_work_buffer[BlockSize];
-
-        auto y_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_y_global, y_grid_desc_m_k.GetElementSpaceSize());
+        __shared__ ComputeDataType p_reduce_work_buffer[BlockSize];

        auto reduce_work_buf =
            make_dynamic_buffer<AddressSpaceEnum::Lds>(p_reduce_work_buffer, BlockSize);

-        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
-            x_thread_buf;
-
-        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
-            gamma_thread_buf;
-
-        StaticBuffer<AddressSpaceEnum::Vgpr,
-                     AccDataType,
-                     MThreadSliceSize * KThreadSliceSize,
-                     true>& beta_thread_buf = gamma_thread_buf;
-
-        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
-            y_thread_buf;
-
-        StaticBuffer<AddressSpaceEnum::Vgpr,
-                     AccDataType,
-                     MThreadSliceSize * KThreadSliceSize,
-                     true>& x_square_thread_buf = y_thread_buf;
+        auto y_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_y_global, y_grid_desc_m_k.GetElementSpaceSize());

-        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> mean_thread_buf;
-        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>
+        auto x_thread_buf = generate_tuple(
+            [&](auto) {
+                return StaticBuffer<AddressSpaceEnum::Vgpr,
+                                    ComputeDataType,
+                                    MThreadSliceSize * XSrcVectorSize,
+                                    true>{};
+            },
+            Number<ThreadBufferNumber>{});
+
+        auto gamma_thread_buf = generate_tuple(
+            [&](auto) {
+                return StaticBuffer<AddressSpaceEnum::Vgpr,
+                                    ComputeDataType,
+                                    MThreadSliceSize * GammaSrcVectorSize,
+                                    true>{};
+            },
+            Number<ThreadBufferNumber>{});
+
+        auto beta_thread_buf = generate_tuple(
+            [&](auto) {
+                return StaticBuffer<AddressSpaceEnum::Vgpr,
+                                    ComputeDataType,
+                                    MThreadSliceSize * BetaSrcVectorSize,
+                                    true>{};
+            },
+            Number<ThreadBufferNumber>{});
+
+        auto y_thread_buf = generate_tuple(
+            [&](auto) {
+                return StaticBuffer<AddressSpaceEnum::Vgpr,
+                                    ComputeDataType,
+                                    MThreadSliceSize * YDstVectorSize,
+                                    true>{};
+            },
+            Number<ThreadBufferNumber>{});
+
+        auto& x_square_thread_buf = y_thread_buf;
+
+        StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>
+            mean_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>
            mean_square_thread_buf;
-        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>& var_thread_buf =
-            mean_square_thread_buf;
-
-        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
-            mean_thread_buf(I)        = reduce::Add::template GetIdentityValue<AccDataType>();
-            mean_square_thread_buf(I) = reduce::Add::template GetIdentityValue<AccDataType>();
-        });
+        StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>&
+            var_thread_buf = mean_square_thread_buf;

        const index_t thread_local_id = get_thread_local_1d_id();
        const index_t block_global_id = get_block_1d_id();
@@ -149,12 +169,8 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
        const auto thread_m_cluster_id = thread_cluster_idx[I0];
        const auto thread_k_cluster_id = thread_cluster_idx[I1];

-        using ThreadBufferLengths_M_K         = Sequence<MThreadSliceSize, KThreadSliceSize>;
-        constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed(
-            make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));
-
        auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2<XDataType,
-                                                                  AccDataType,
+                                                                  ComputeDataType,
                                                                  GridDesc_M_K,
                                                                  decltype(thread_buffer_desc_m_k),
                                                                  ThreadBufferLengths_M_K,
@@ -166,11 +182,11 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
            x_grid_desc_m_k,
            make_multi_index(block_global_id * M_BlockTileSize +
                                 thread_m_cluster_id * MThreadSliceSize,
-                             thread_k_cluster_id * KThreadSliceSize));
+                             thread_k_cluster_id * XSrcVectorSize));

        auto threadwise_gamma_load =
            ThreadwiseTensorSliceTransfer_v2<GammaDataType,
-                                             AccDataType,
+                                             ComputeDataType,
                                             GridDesc_M_K,
                                             decltype(thread_buffer_desc_m_k),
                                             ThreadBufferLengths_M_K,
@@ -182,11 +198,11 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
                gamma_grid_desc_m_k,
                make_multi_index(block_global_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
-                                 thread_k_cluster_id * KThreadSliceSize));
+                                 thread_k_cluster_id * GammaSrcVectorSize));

        auto threadwise_beta_load =
            ThreadwiseTensorSliceTransfer_v2<BetaDataType,
-                                             AccDataType,
+                                             ComputeDataType,
                                             GridDesc_M_K,
                                             decltype(thread_buffer_desc_m_k),
                                             ThreadBufferLengths_M_K,
@@ -198,14 +214,14 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
                beta_grid_desc_m_k,
                make_multi_index(block_global_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
-                                 thread_k_cluster_id * KThreadSliceSize));
+                                 thread_k_cluster_id * BetaSrcVectorSize));

        auto threadwise_y_store =
-            ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
+            ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
                                               YDataType,
                                               decltype(thread_buffer_desc_m_k),
                                               GridDesc_M_K,
-                                               AccElementwiseOperation,
+                                               YElementwiseOperation,
                                               ThreadBufferLengths_M_K,
                                               ThreadBufferDimAccessOrder,
                                               YDstVectorDim,
@@ -216,13 +232,10 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
                y_grid_desc_m_k,
                make_multi_index(block_global_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
-                                 thread_k_cluster_id * KThreadSliceSize),
-                acc_elementwise_op);
+                                 thread_k_cluster_id * YDstVectorSize),
+                y_elementwise_op);

-        // Copy x from Cache
-        // one pass: fwd, second pass: bwd
-        constexpr auto thread_copy_fwd_step_m_k =
-            make_multi_index(0, SweepOnce ? 0 : K_BlockTileSize);
+        constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileStepSize);
        constexpr auto thread_copy_bwd_step_m_k =
            make_multi_index(0, SweepOnce ? 0 : -K_BlockTileSize);

@@ -239,121 +252,244 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
        // FIXME: Should not hack the transform from deviceOP
        int reduce_length = x_grid_desc_m_k.GetTransforms()[I2].GetUpperLengths()[I0];

-        index_t reducedTiles = 0;
-        do
-        {
-            threadwise_x_load.Run(x_grid_desc_m_k,
-                                  x_global_val_buf,
-                                  thread_buffer_desc_m_k,
-                                  make_tuple(I0, I0),
-                                  x_thread_buf);
+        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
+            mean_thread_buf(I)        = reduce::Add::template GetIdentityValue<ComputeDataType>();
+            mean_square_thread_buf(I) = reduce::Add::template GetIdentityValue<ComputeDataType>();
+        });

-            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
-                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
-                    constexpr auto offset_m_k =
-                        thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK));
-                    x_square_thread_buf(Number<offset_m_k>{}) =
-                        x_thread_buf(Number<offset_m_k>{}) * x_thread_buf(Number<offset_m_k>{});
+        // Separate sweep once and sweep twice pipeline
+        if constexpr(SweepOnce)
+        {
+            static_for<0, ThreadBufferNumber, 1>{}([&](auto i) {
+                threadwise_x_load.Run(x_grid_desc_m_k,
+                                      x_global_val_buf,
+                                      thread_buffer_desc_m_k,
+                                      make_tuple(I0, I0),
+                                      x_thread_buf(i));
+
+                threadwise_gamma_load.Run(gamma_grid_desc_m_k,
+                                          gamma_global_val_buf,
+                                          thread_buffer_desc_m_k,
+                                          make_tuple(I0, I0),
+                                          gamma_thread_buf(i));
+
+                static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
+                    static_for<0, XSrcVectorSize, 1>{}([&](auto iK) {
+                        constexpr auto offset_m_k =
+                            thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK));
+                        x_square_thread_buf(i)(Number<offset_m_k>{}) =
+                            x_thread_buf(i)(Number<offset_m_k>{}) *
+                            x_thread_buf(i)(Number<offset_m_k>{});
+                    });
                });
-            });

-            ThreadwiseSumReduce::Reduce(x_thread_buf, mean_thread_buf);
-            ThreadwiseSumReduce::Reduce(x_square_thread_buf, mean_square_thread_buf);
+                ThreadwiseSumReduce::Reduce(x_thread_buf[i], mean_thread_buf);
+                ThreadwiseSumReduce::Reduce(x_square_thread_buf[i], mean_square_thread_buf);

-            threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k);
-
-            ++reducedTiles;
-        } while(reducedTiles < num_k_block_tile_iteration);
+                if constexpr(i != ThreadBufferNumber - 1)
+                {
+                    threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k);
+                    threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k,
+                                                             thread_copy_fwd_step_m_k);
+                }
+            });

-        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
-            if constexpr(I > 0)
-                block_sync_lds();
+            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
+                if constexpr(I > 0)
+                    block_sync_lds();

-            BlockwiseSumReduce::Reduce(reduce_work_buf, mean_thread_buf(I));
-            mean_thread_buf(I) = mean_thread_buf(I) / reduce_length;
+                BlockwiseSumReduce::Reduce(reduce_work_buf, mean_thread_buf(I));
+                mean_thread_buf(I) = mean_thread_buf(I) / reduce_length;

-            block_sync_lds();
+                block_sync_lds();

-            BlockwiseSumReduce::Reduce(reduce_work_buf, mean_square_thread_buf(I));
-            mean_square_thread_buf(I) = mean_square_thread_buf(I) / reduce_length;
+                BlockwiseSumReduce::Reduce(reduce_work_buf, mean_square_thread_buf(I));
+                mean_square_thread_buf(I) = mean_square_thread_buf(I) / reduce_length;

-            // var(x) = E[x^2] - E[x]^2
-            var_thread_buf(I) =
-                mean_square_thread_buf(I) - (mean_thread_buf(I) * mean_thread_buf(I));
-        });
+                // var(x) = E[x^2] - E[x]^2
+                var_thread_buf(I) =
+                    mean_square_thread_buf(I) - (mean_thread_buf(I) * mean_thread_buf(I));
+            });

-        // y = (x - E[x]) / sqrt(var[x] + epsilon)
-        auto thread_copy_tail_m_k = (num_k_block_tile_iteration - 1) * thread_copy_fwd_step_m_k;
+            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
+                auto divisor = 1 / ck::math::sqrt(var_thread_buf(iM) + epsilon);
+                static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) {
+                    threadwise_beta_load.Run(beta_grid_desc_m_k,
+                                             beta_global_val_buf,
+                                             thread_buffer_desc_m_k,
+                                             make_tuple(I0, I0),
+                                             beta_thread_buf(iK0));
+                    if constexpr(iK0 != ThreadBufferNumber - 1)
+                        threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k,
+                                                                thread_copy_fwd_step_m_k);
+
+                    static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) {
+                        constexpr auto offset_m_k =
+                            thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1));
+
+                        // normalize
+                        y_thread_buf(iK0)(Number<offset_m_k>{}) =
+                            (x_thread_buf(iK0)(Number<offset_m_k>{}) - mean_thread_buf(iM)) *
+                            divisor;
+
+                        // gamma & beta
+                        y_thread_buf(iK0)(Number<offset_m_k>{}) =
+                            y_thread_buf(iK0)(Number<offset_m_k>{}) *
+                                gamma_thread_buf(iK0)(Number<offset_m_k>{}) +
+                            beta_thread_buf(iK0)(Number<offset_m_k>{});
+                    });
+                });
+            });

-        threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k);
-        threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, thread_copy_tail_m_k);
-        threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, thread_copy_tail_m_k);
-        threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_tail_m_k);
+            static_for<0, ThreadBufferNumber, 1>{}([&](auto i) {
+                threadwise_y_store.Run(thread_buffer_desc_m_k,
+                                       make_tuple(I0, I0),
+                                       y_thread_buf(i),
+                                       y_grid_desc_m_k,
+                                       y_global_val_buf);

-        reducedTiles = 0;
-        do
+                if constexpr(i != ThreadBufferNumber - 1)
+                    threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k,
+                                                          thread_copy_fwd_step_m_k);
+            });
+        } // end of sweep once
+        else
        {
-            if constexpr(!SweepOnce)
+            for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles)
            {
-                threadwise_x_load.Run(x_grid_desc_m_k,
-                                      x_global_val_buf,
-                                      thread_buffer_desc_m_k,
-                                      make_tuple(I0, I0),
-                                      x_thread_buf);
+                static_for<0, ThreadBufferNumber, 1>{}([&](auto i) {
+                    threadwise_x_load.Run(x_grid_desc_m_k,
+                                          x_global_val_buf,
+                                          thread_buffer_desc_m_k,
+                                          make_tuple(I0, I0),
+                                          x_thread_buf(i));
+                    threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k);
+
+                    static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
+                        static_for<0, XSrcVectorSize, 1>{}([&](auto iK) {
+                            constexpr auto offset_m_k =
+                                thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK));
+                            x_square_thread_buf(i)(Number<offset_m_k>{}) =
+                                x_thread_buf(i)(Number<offset_m_k>{}) *
+                                x_thread_buf(i)(Number<offset_m_k>{});
+                        });
+                    });
+
+                    ThreadwiseSumReduce::Reduce(x_thread_buf[i], mean_thread_buf);
+                    ThreadwiseSumReduce::Reduce(x_square_thread_buf[i], mean_square_thread_buf);
+                });
            }

-            threadwise_gamma_load.Run(gamma_grid_desc_m_k,
-                                      gamma_global_val_buf,
-                                      thread_buffer_desc_m_k,
-                                      make_tuple(I0, I0),
-                                      gamma_thread_buf);
+            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
+                if constexpr(I > 0)
+                    block_sync_lds();

-            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
-                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
-                    constexpr auto offset_m_k =
-                        thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK));
-
-                    // normalize
-                    y_thread_buf(Number<offset_m_k>{}) =
-                        (x_thread_buf(Number<offset_m_k>{}) - mean_thread_buf(iM)) /
-                        sqrt(var_thread_buf(iM) + epsilon);
-
-                    // gamma
-                    y_thread_buf(Number<offset_m_k>{}) =
-                        y_thread_buf(Number<offset_m_k>{}) * gamma_thread_buf(Number<offset_m_k>{});
-                });
-            });
+                BlockwiseSumReduce::Reduce(reduce_work_buf, mean_thread_buf(I));
+                mean_thread_buf(I) = mean_thread_buf(I) / reduce_length;

-            threadwise_beta_load.Run(beta_grid_desc_m_k,
-                                     beta_global_val_buf,
-                                     thread_buffer_desc_m_k,
-                                     make_tuple(I0, I0),
-                                     beta_thread_buf);
+                block_sync_lds();

-            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
-                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
-                    constexpr auto offset_m_k =
-                        thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK));
+                BlockwiseSumReduce::Reduce(reduce_work_buf, mean_square_thread_buf(I));
+                mean_square_thread_buf(I) = mean_square_thread_buf(I) / reduce_length;

-                    // beta
-                    y_thread_buf(Number<offset_m_k>{}) =
-                        y_thread_buf(Number<offset_m_k>{}) + beta_thread_buf(Number<offset_m_k>{});
-                });
+                // var(x) = E[x^2] - E[x]^2
+                var_thread_buf(I) =
+                    mean_square_thread_buf(I) - (mean_thread_buf(I) * mean_thread_buf(I));
            });

-            threadwise_y_store.Run(thread_buffer_desc_m_k,
-                                   make_tuple(I0, I0),
-                                   y_thread_buf,
-                                   y_grid_desc_m_k,
-                                   y_global_val_buf);
+            auto thread_copy_tail_m_k =
+                (num_k_block_tile_iteration - 1) * ThreadBufferNumber * thread_copy_fwd_step_m_k;

            threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k);
-            threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, thread_copy_bwd_step_m_k);
-            threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, thread_copy_bwd_step_m_k);
-            threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_bwd_step_m_k);
+            threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, thread_copy_tail_m_k);
+            threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, thread_copy_tail_m_k);
+            threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_tail_m_k);
+
+            for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles)
+            {
+                static_for<0, ThreadBufferNumber, 1>{}([&](auto i) {
+                    threadwise_x_load.Run(x_grid_desc_m_k,
+                                          x_global_val_buf,
+                                          thread_buffer_desc_m_k,
+                                          make_tuple(I0, I0),
+                                          x_thread_buf(i));
+                    threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k);
+                });

-            ++reducedTiles;
-        } while(reducedTiles < num_k_block_tile_iteration);
+                static_for<0, ThreadBufferNumber, 1>{}([&](auto i) {
+                    threadwise_gamma_load.Run(gamma_grid_desc_m_k,
+                                              gamma_global_val_buf,
+                                              thread_buffer_desc_m_k,
+                                              make_tuple(I0, I0),
+                                              gamma_thread_buf(i));
+
+                    threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k,
+                                                             thread_copy_fwd_step_m_k);
+                });
+
+                static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
+                    auto divisor = 1 / ck::math::sqrt(var_thread_buf(iM) + epsilon);
+                    static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) {
+                        static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) {
+                            constexpr auto offset_m_k =
+                                thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1));
+
+                            // normalize
+                            y_thread_buf(iK0)(Number<offset_m_k>{}) =
+                                (x_thread_buf(iK0)(Number<offset_m_k>{}) - mean_thread_buf(iM)) *
+                                divisor;
+
+                            // gamma
+                            y_thread_buf(iK0)(Number<offset_m_k>{}) =
+                                y_thread_buf(iK0)(Number<offset_m_k>{}) *
+                                gamma_thread_buf(iK0)(Number<offset_m_k>{});
+                        });
+                    });
+                });
+
+                static_for<0, ThreadBufferNumber, 1>{}([&](auto i) {
+                    threadwise_beta_load.Run(beta_grid_desc_m_k,
+                                             beta_global_val_buf,
+                                             thread_buffer_desc_m_k,
+                                             make_tuple(I0, I0),
+                                             beta_thread_buf(i));
+                    threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k,
+                                                            thread_copy_fwd_step_m_k);
+                });
+
+                static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
+                    static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) {
+                        static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) {
+                            constexpr auto offset_m_k =
+                                thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1));
+
+                            // beta
+                            y_thread_buf(iK0)(Number<offset_m_k>{}) =
+                                y_thread_buf(iK0)(Number<offset_m_k>{}) +
+                                beta_thread_buf(iK0)(Number<offset_m_k>{});
+                        });
+                    });
+                });
+
+                static_for<0, ThreadBufferNumber, 1>{}([&](auto i) {
+                    threadwise_y_store.Run(thread_buffer_desc_m_k,
+                                           make_tuple(I0, I0),
+                                           y_thread_buf(i),
+                                           y_grid_desc_m_k,
+                                           y_global_val_buf);
+                    threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k,
+                                                          thread_copy_fwd_step_m_k);
+                });
+
+                threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, 2 * thread_copy_bwd_step_m_k);
+                threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k,
+                                                         2 * thread_copy_bwd_step_m_k);
+                threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k,
+                                                        2 * thread_copy_bwd_step_m_k);
+                threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k,
+                                                      2 * thread_copy_bwd_step_m_k);
+            }
+        } // end of sweep twice
    }
 };