Move common expression 1/N out of the static_for loops

f4d67cf8 · Qianfeng Zhang · 7a190876 · f4d67cf8 · f4d67cf8
Commit f4d67cf8 authored Nov 03, 2022 by Qianfeng Zhang
2 changed files
--- a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp
+++ b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp
@@ -473,6 +473,9 @@ struct GridwiseReduceSecondHalfBatchNormBackwardFinal
        constexpr auto xy_thread_copy_step_m_k = make_multi_index(0, K_BlockTileSize);
+        AccDataType inv_reduce_size =
+            type_convert<AccDataType>(1.0) / type_convert<AccDataType>(reduce_size);
        for(index_t reducedTiles = 0; reducedTiles < num_xy_k_block_tile_iteration; ++reducedTiles)
        {
            threadwise_x_load.Run(x_grid_desc_m_k,
@@ -488,9 +491,8 @@ struct GridwiseReduceSecondHalfBatchNormBackwardFinal
                                   dy_thread_buf);
            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
-                AccDataType multiplier = type_convert<AccDataType>(1.0) /
-                                         type_convert<AccDataType>(reduce_size) *
-                                         inv_var_thread_buf[iM] * scale_thread_buf[iM];
+                AccDataType multiplier =
+                    inv_reduce_size * inv_var_thread_buf[iM] * scale_thread_buf[iM];
                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
                    constexpr auto offset =

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp
@@ -506,6 +506,9 @@ struct GridwiseBatchNormBackwardWithBlockwiseWelford
        threadwise_dy_load.MoveSrcSliceWindow(dy_grid_desc_m_k, thread_copy_bwd_step_m_k);
        threadwise_dx_store.MoveDstSliceWindow(dx_grid_desc_m_k, thread_copy_tail_m_k);
+        AccDataType inv_reduce_size =
+            type_convert<AccDataType>(1.0) / type_convert<AccDataType>(reduce_size);
        for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles)
        {
            threadwise_x_load.Run(x_grid_desc_m_k,
@@ -521,9 +524,8 @@ struct GridwiseBatchNormBackwardWithBlockwiseWelford
                                   dy_thread_buf);
            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
-                AccDataType multiplier = type_convert<AccDataType>(1.0) /
-                                         type_convert<AccDataType>(reduce_size) *
-                                         inv_var_thread_buf[iM] * scale_thread_buf[iM];
+                AccDataType multiplier =
+                    inv_reduce_size * inv_var_thread_buf[iM] * scale_thread_buf[iM];
                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
                    constexpr auto offset =