Rename GetZeroVal() to GetReductionZeroVal() in the kernels

4fea4251 · Qianfeng Zhang · 52ae56f8 · 4fea4251 · 4fea4251 · 4fea4251
Commit 4fea4251 authored Sep 17, 2021 by Qianfeng Zhang
7 changed files
--- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp
@@ -92,7 +92,7 @@ struct GridwiseReduction_xy_to_x_blockwise
        // LDS
        __shared__ compType p_in_block_buffer[BlockBufferSize];

-        constexpr auto zeroVal = opReduce::GetZeroVal();
+        constexpr auto zeroVal = opReduce::GetReductionZeroVal();

        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
@@ -243,7 +243,7 @@ struct GridwiseReduction_xy_to_x_blockwise
        __shared__ compType p_in_block_buffer[BlockBufferSize];
        __shared__ int block_indices_buffer[BlockBufferSize];

-        constexpr auto zeroVal = opReduce::GetZeroVal();
+        constexpr auto zeroVal = opReduce::GetReductionZeroVal();

        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
@@ -431,7 +431,7 @@ struct GridwiseReduction_xy_to_x_blockwise
        __shared__ compType p_in_block_buffer[BlockBufferSize];
        __shared__ int block_indices_buffer[BlockBufferSize];

-        constexpr auto zeroVal = opReduce::GetZeroVal();
+        constexpr auto zeroVal = opReduce::GetReductionZeroVal();

        const auto src_global_val_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Global>(ws_values_global,

--- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp
@@ -82,7 +82,7 @@ struct GridwiseReduction_xy_to_x_direct_threadwise
        (void)ws_indices_global;
        (void)indices_global;

-        constexpr auto zeroVal = opReduce::GetZeroVal();
+        constexpr auto zeroVal = opReduce::GetReductionZeroVal();

        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
@@ -204,7 +204,7 @@ struct GridwiseReduction_xy_to_x_direct_threadwise
    {
        (void)ws_indices_global;

-        constexpr auto zeroVal = opReduce::GetZeroVal();
+        constexpr auto zeroVal = opReduce::GetReductionZeroVal();

        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
@@ -348,7 +348,7 @@ struct GridwiseReduction_xy_to_x_direct_threadwise
    {
        (void)origReduceLen;

-        constexpr auto zeroVal = opReduce::GetZeroVal();
+        constexpr auto zeroVal = opReduce::GetReductionZeroVal();

        const auto src_global_val_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Global>(ws_values_global,

--- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp
@@ -82,7 +82,7 @@ struct GridwiseReduction_xy_to_x_direct_warpwise
        (void)ws_indices_global;
        (void)indices_global;

-        constexpr auto zeroVal = opReduce::GetZeroVal();
+        constexpr auto zeroVal = opReduce::GetReductionZeroVal();

        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
@@ -215,7 +215,7 @@ struct GridwiseReduction_xy_to_x_direct_warpwise
    {
        (void)ws_indices_global;

-        constexpr auto zeroVal = opReduce::GetZeroVal();
+        constexpr auto zeroVal = opReduce::GetReductionZeroVal();

        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
@@ -373,7 +373,7 @@ struct GridwiseReduction_xy_to_x_direct_warpwise
    {
        (void)origReduceLen;

-        constexpr auto zeroVal = opReduce::GetZeroVal();
+        constexpr auto zeroVal = opReduce::GetReductionZeroVal();

        const auto src_global_val_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Global>(ws_values_global,

--- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp
@@ -86,7 +86,7 @@ struct GridwiseReduction_xy_to_x_multiblock
        (void)alpha; // unused
        (void)beta;  // unused

-        constexpr auto zeroVal = opReduce::GetZeroVal();
+        constexpr auto zeroVal = opReduce::GetReductionZeroVal();

        // LDS
        __shared__ compType p_in_block_buffer[BlockBufferSize];
@@ -216,7 +216,7 @@ struct GridwiseReduction_xy_to_x_multiblock
        (void)alpha; // unused
        (void)beta;  // unused

-        constexpr auto zeroVal = opReduce::GetZeroVal();
+        constexpr auto zeroVal = opReduce::GetReductionZeroVal();

        // LDS
        __shared__ compType p_in_block_values_buffer[BlockBufferSize];

--- a/composable_kernel/include/tensor_operation/reduction_functions_blockwise.hpp
+++ b/composable_kernel/include/tensor_operation/reduction_functions_blockwise.hpp
@@ -56,7 +56,7 @@ struct BlockwiseReduction_2d_block_buffer
    Reduce(BufferType& block_buffer, index_t toReduceBlocks, compType& accuData)
    {
        const index_t thread_local_id = get_thread_local_1d_id();
-        compType lAccuData            = opReduce::GetZeroVal();
+        compType lAccuData            = opReduce::GetReductionZeroVal();

        index_t offset;
        for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
@@ -115,7 +115,7 @@ struct BlockwiseReduction_2d_block_buffer
                                   int& accuIndex)
    {
        const index_t thread_local_id = get_thread_local_1d_id();
-        compType lAccuData            = opReduce::GetZeroVal();
+        compType lAccuData            = opReduce::GetReductionZeroVal();
        int lAccuIndex                = 0;

        if constexpr(blockIsOneRow)

--- a/composable_kernel/include/tensor_operation/reduction_functions_warpwise.hpp
+++ b/composable_kernel/include/tensor_operation/reduction_functions_warpwise.hpp
@@ -62,7 +62,7 @@ struct WarpReduce
    // This interface implementation uses HIP built-in device shuffling functions
    __device__ static void ReduceImpl1(const BufferType& thread_buffer, compType& accuData)
    {
-        compType lAccuData = opReduce::GetZeroVal();
+        compType lAccuData = opReduce::GetReductionZeroVal();

        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto I) { binop::calculate(lAccuData, thread_buffer[I]); });
@@ -84,7 +84,7 @@ struct WarpReduce
    // since for fp16, built-in shuffling functions are not provided by HIP
    __device__ static void ReduceImpl2(const BufferType& thread_buffer, compType& accuData)
    {
-        compType lAccuData = opReduce::GetZeroVal();
+        compType lAccuData = opReduce::GetReductionZeroVal();

        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto I) { binop::calculate(lAccuData, thread_buffer[I]); });
@@ -138,7 +138,7 @@ struct WarpReduce
                                        int& accuIndex,
                                        int indexStart)
    {
-        compType lAccuData       = opReduce::GetZeroVal();
+        compType lAccuData       = opReduce::GetReductionZeroVal();
        int lAccuIndex           = 0;
        index_t thread_inwarp_id = get_thread_local_1d_id() % warpSize;

@@ -170,7 +170,7 @@ struct WarpReduce
                                        int& accuIndex,
                                        int indexStart)
    {
-        compType lAccuData       = opReduce::GetZeroVal();
+        compType lAccuData       = opReduce::GetReductionZeroVal();
        int lAccuIndex           = 0;
        index_t thread_id        = get_thread_local_1d_id();
        index_t warpId           = thread_id / warpSize;
@@ -278,7 +278,7 @@ struct WarpReduceWithIndicesInput
                                       compType& accuData,
                                       int& accuIndex)
    {
-        compType lAccuData = opReduce::GetZeroVal();
+        compType lAccuData = opReduce::GetReductionZeroVal();
        int lAccuIndex     = 0;

        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
@@ -307,7 +307,7 @@ struct WarpReduceWithIndicesInput
                                       compType& accuData,
                                       int& accuIndex)
    {
-        compType lAccuData       = opReduce::GetZeroVal();
+        compType lAccuData       = opReduce::GetReductionZeroVal();
        int lAccuIndex           = 0;
        index_t thread_id        = get_thread_local_1d_id();
        index_t warpId           = thread_id / warpSize;

--- a/composable_kernel/include/utility/reduction_operator.hpp
+++ b/composable_kernel/include/utility/reduction_operator.hpp
@@ -35,8 +35,8 @@ namespace reduce {
 // Every binary operator used in reduction is represented by a templated functor class. Each functor
 // class must provide at least
 // three members:
-// 1) GetZeroVal() -- the interface to return the "identity element" for the binary operator,
-// "identity element" is the unique
+// 1) GetReductionZeroVal() -- the interface to return the "identity element" for the binary
+// operator; the "identity element" is the unique
 //                    element in the algebraic space that doesn't affect the value of other elements
 //                    when operated with any of them.
 // 2) indexable -- boolean value indicating whether indices of the operated elements could be
@@ -58,7 +58,7 @@ struct Add
 {
    using dataType = T;

-    __device__ static constexpr T GetZeroVal() { return static_cast<T>(0.0f); };
+    __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };

    __device__ inline constexpr void operator()(T& a, T b) const { a = a + b; }

@@ -70,7 +70,7 @@ struct Mul
 {
    using dataType = T;

-    __device__ static constexpr T GetZeroVal() { return static_cast<T>(1.0f); };
+    __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(1.0f); };

    __device__ inline constexpr void operator()(T& a, T b) const { a = a * b; }

@@ -82,7 +82,7 @@ struct Max
 {
    using dataType = T;

-    __device__ static constexpr T GetZeroVal() { return NumericLimits<T>::lowest(); };
+    __device__ static constexpr T GetReductionZeroVal() { return NumericLimits<T>::lowest(); };

    __device__ inline constexpr void operator()(T& a, T b) const
    {
@@ -107,7 +107,7 @@ struct Min
 {
    using dataType = T;

-    __device__ static constexpr T GetZeroVal() { return NumericLimits<T>::Max(); };
+    __device__ static constexpr T GetReductionZeroVal() { return NumericLimits<T>::Max(); };

    __device__ inline constexpr void operator()(T& a, T b) const
    {
@@ -132,7 +132,7 @@ struct AMax
 {
    using dataType = T;

-    __device__ static constexpr T GetZeroVal() { return static_cast<T>(0.0f); };
+    __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };

    __device__ inline constexpr void operator()(T& a, T b) const
    {
@@ -281,7 +281,7 @@ struct unary_sqrt<half_t>

 // The templated struct reduce_binary_operator maps the enum Ids of binary operators to their
 // respective functor classes.
-// The "GetZeroVal()" interface and boolean member "indexable" are also provided in
+// The "GetReductionZeroVal()" interface and boolean member "indexable" are also provided in
 // reduce_binary_operator for
 // easier checking by the upper-layer codes in the kernels.