Improve performance of the dx expression by hoisting its iK-invariant multiplier out of the inner loop

d5728dd3 · Qianfeng Zhang · de6aad06 · d5728dd3
Commit d5728dd3 authored Oct 31, 2022 by Qianfeng Zhang
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 5 additions and 2 deletions

include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp +5 -2

No files found.
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp
@@ -508,6 +508,10 @@ struct GridwiseBatchNormBackwardWithBlockwiseWelford
                                   dy_thread_buf);
            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
+                AccDataType multiplier = type_convert<AccDataType>(1.0) /
+                                         type_convert<AccDataType>(reduce_size) *
+                                         inv_var_thread_buf[iM] * scale_thread_buf[iM];
                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
                    constexpr auto offset =
                        thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK));
@@ -518,8 +522,7 @@ struct GridwiseBatchNormBackwardWithBlockwiseWelford
                    AccDataType tmpVal = norm_x * dscale_thread_buf[iM];
                    dx_thread_buf(Number<offset>{}) =
-                        type_convert<AccDataType>(1.0) / type_convert<AccDataType>(reduce_size) *
+                        multiplier *
-                        inv_var_thread_buf[iM] * scale_thread_buf[iM] *
                        (type_convert<AccDataType>(reduce_size) * dy_thread_buf[Number<offset>{}] -
                         dbias_thread_buf[iM] - tmpVal);
                });