Commit 9dce6851 authored by Jing Zhang

merge develop

parents 3cc57101 5d37d7bf
......@@ -25,21 +25,30 @@ struct MagicDivision
// uint32_t
__host__ __device__ static constexpr auto CalculateMagicNumbers(uint32_t divisor)
{
-        // assert(divisior >= 1 && divisior <= INT32_MAX);
-
-        uint32_t shift = 0;
-        for(shift = 0; shift < 32; ++shift)
-        {
-            if((1U << shift) >= divisor)
-            {
-                break;
-            }
-        }
-
-        uint64_t one = 1;
-        uint64_t multiplier = ((one << 32) * ((one << shift) - divisor)) / divisor + 1;
-        // assert(multiplier <= 0xffffffffUL);
-
-        return make_tuple(uint32_t(multiplier), shift);
+        // WARNING: magic division is only applicable for divisors inside this range.
+        // You should not use the return value of CalculateMagicNumbers if the divisor is not
+        // inside this range. The "else" branch below only exists to quiet down a run-time error.
+        if(divisor >= 1 && divisor <= INT32_MAX)
+        {
+            uint32_t shift = 0;
+            for(shift = 0; shift < 32; ++shift)
+            {
+                if((1U << shift) >= divisor)
+                {
+                    break;
+                }
+            }
+
+            uint64_t one = 1;
+            uint64_t multiplier = ((one << 32) * ((one << shift) - divisor)) / divisor + 1;
+            // assert(multiplier <= 0xffffffffUL);
+
+            return make_tuple(uint32_t(multiplier), shift);
+        }
+        else
+        {
+            return make_tuple(uint32_t(0), uint32_t(0));
+        }
}
__host__ __device__ static constexpr uint32_t CalculateMagicMultiplier(uint32_t divisor)
......
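The (multiplier, shift) pair produced by CalculateMagicNumbers lets an integer division be replaced by a multiply-high, an add and a shift, which is much cheaper than a hardware divide in GPU index arithmetic. A minimal host-side sketch of the same construction (the helper names and the main() driver are hypothetical and not part of this commit; the real code is the MagicDivision member above):

#include <cassert>
#include <cstdint>
#include <utility>

// Same construction as CalculateMagicNumbers above: find the smallest shift with
// 2^shift >= divisor, then derive a 32-bit multiplier. Valid for 1 <= divisor <= INT32_MAX.
std::pair<uint32_t, uint32_t> calculate_magic_numbers(uint32_t divisor)
{
    assert(divisor >= 1 && divisor <= 0x7fffffffu);

    uint32_t shift = 0;
    while((1u << shift) < divisor)
        ++shift;

    uint64_t one        = 1;
    uint64_t multiplier = ((one << 32) * ((one << shift) - divisor)) / divisor + 1;
    return {static_cast<uint32_t>(multiplier), shift};
}

// Divide without a hardware divide: take the high 32 bits of the 32x32 product
// (what __umulhi does on the GPU), add the dividend, and shift.
uint32_t magic_divide(uint32_t dividend, uint32_t multiplier, uint32_t shift)
{
    uint64_t hi = (static_cast<uint64_t>(dividend) * multiplier) >> 32;
    return static_cast<uint32_t>((hi + dividend) >> shift);
}

int main()
{
    auto [multiplier, shift] = calculate_magic_numbers(7);
    assert(magic_divide(100, multiplier, shift) == 100 / 7); // 14
    return 0;
}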
#ifndef CK_MATH_V2_HPP
#define CK_MATH_V2_HPP
#include "data_type.hpp"
namespace ck {
namespace math {
static inline __device__ half_t abs(half_t x) { return __habs(x); };
static inline __device__ half_t sqrtf(half_t x) { return hsqrt(x); };
static inline __device__ bool isnan(half_t x) { return __hisnan(x); };
} // namespace math
} // namespace ck
#endif
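The guarded header above adds ck::math wrappers so templated device code can call abs, sqrtf and isnan on half_t and have them resolve to the __half intrinsics. A rough usage sketch (the kernel below is hypothetical and only illustrates the call sites; it assumes half_t converts from float literals the way it does elsewhere in CK):

__global__ void sqrt_abs_or_zero(half_t* data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < n)
    {
        half_t v = data[i];
        // isnan/abs/sqrtf resolve to __hisnan/__habs/hsqrt through the wrappers above
        data[i] = ck::math::isnan(v) ? static_cast<half_t>(0.0f)
                                     : ck::math::sqrtf(ck::math::abs(v));
    }
}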
......@@ -48,6 +48,18 @@ struct float_equal_zero
};
};
template <index_t N>
static constexpr __device__ index_t get_shift()
{
return (get_shift<N / 2>() + 1);
};
template <>
constexpr __device__ index_t get_shift<1>()
{
return (0);
}
}; // end of namespace ck
#endif
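The new get_shift<N> helper computes log2 of a power-of-two N at compile time: each recursion halves N until the N == 1 specialization terminates, so get_shift<8>() expands to get_shift<4>() + 1 and finally to 3. A standalone re-sketch in plain C++ (hypothetical; the real definitions above are __device__ and use ck::index_t, whose 32-bit width is assumed here):

#include <cstdint>

using index_t = int32_t; // assumption: matches ck::index_t's width

template <index_t N>
constexpr index_t get_shift()
{
    // one level of recursion per halving: get_shift<8>() == get_shift<4>() + 1 == 3
    return get_shift<N / 2>() + 1;
}

template <>
constexpr index_t get_shift<1>()
{
    return 0;
}

static_assert(get_shift<1>() == 0, "log2(1) == 0");
static_assert(get_shift<8>() == 3, "log2(8) == 3");
static_assert(get_shift<64>() == 6, "log2(64) == 6");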
......@@ -34,50 +34,79 @@
namespace ck {
namespace detail {
-static inline __device__ bool isnan(half_t x) { return __hisnan(x); };
+template <typename T>
+static inline __device__ bool is_nan(T x)
+{
+    return (isnan(x));
+};
+
+template <>
+inline __device__ bool is_nan<half_t>(half_t x)
+{
+    return (__hisnan(x));
+};

-template <NanPropagation_t nanPropaOpt, typename opReduce, typename compType>
-struct binop_with_nan_check;
+template <bool PropagateNan, typename ReduceOperation, typename AccDataType>
+struct AccumulateWithNanCheck;

-template <typename opReduce, typename compType>
-struct binop_with_nan_check<NanPropagation_t::NOT_PROPAGATE_NAN, opReduce, compType>
-{
-    // cppcheck-suppress constParameter
-    __device__ static inline void calculate(compType& accuVal, compType currVal)
-    {
-        opReduce{}(accuVal, currVal);
-    };
-
-    // The method is called when the opReduce is indexable and the user asked for indices
-    __device__ static inline void
-    // cppcheck-suppress constParameter
-    calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex)
-    {
-        bool changed = false;
-
-        opReduce{}(accuVal, currVal, changed);
-
-        if(changed)
-            accuIndex = currIndex;
-    };
-};
+template <typename ReduceOperation, typename AccDataType>
+struct AccumulateWithNanCheck<false, ReduceOperation, AccDataType>
+{
+    // cppcheck-suppress constParameter
+    __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal)
+    {
+        ReduceOperation{}(accuVal, currVal);
+    };
+};
+
+template <typename ReduceOperation, typename AccDataType>
+struct AccumulateWithNanCheck<true, ReduceOperation, AccDataType>
+{
+    __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal)
+    {
+        if(is_nan(currVal))
+        {
+            accuVal = currVal;
+        }
+        else
+        {
+            ReduceOperation{}(accuVal, currVal);
+        };
+    };
+};
+
+template <bool PropagateNan, typename ReduceOperation, typename AccDataType, typename IndexDataType>
+struct AccumulateWithIndexAndNanCheck;
+
+template <typename ReduceOperation, typename AccDataType, typename IndexDataType>
+struct AccumulateWithIndexAndNanCheck<false, ReduceOperation, AccDataType, IndexDataType>
+{
+    __device__ static inline void
+    // cppcheck-suppress constParameter
+    Calculate(AccDataType& accuVal,
+              AccDataType currVal,
+              IndexDataType& accuIndex,
+              IndexDataType currIndex)
+    {
+        bool changed = false;
+
+        ReduceOperation{}(accuVal, currVal, changed);
+
+        if(changed)
+            accuIndex = currIndex;
+    };
+};

-template <typename opReduce, typename compType>
-struct binop_with_nan_check<NanPropagation_t::PROPAGATE_NAN, opReduce, compType>
-{
-    __device__ static inline void calculate(compType& accuVal, compType currVal)
-    {
-        if(isnan(currVal))
-            accuVal = currVal;
-        else
-            opReduce{}(accuVal, currVal);
-    };
-
-    // The method is called when the opReduce is indexable and the user asked for indices
-    __device__ static inline void
-    calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex)
+template <typename ReduceOperation, typename AccDataType, typename IndexDataType>
+struct AccumulateWithIndexAndNanCheck<true, ReduceOperation, AccDataType, IndexDataType>
+{
+    // The method is called when the ReduceOperation is indexable and the user asked for indices
+    __device__ static inline void Calculate(AccDataType& accuVal,
+                                            AccDataType currVal,
+                                            IndexDataType& accuIndex,
+                                            IndexDataType currIndex)
    {
-        if(isnan(currVal))
+        if(is_nan(currVal))
        {
            accuVal   = currVal;
            accuIndex = currIndex;
......@@ -86,7 +115,7 @@ struct binop_with_nan_check<NanPropagation_t::PROPAGATE_NAN, opReduce, compType>
        {
            bool changed = false;

-            opReduce{}(accuVal, currVal, changed);
+            ReduceOperation{}(accuVal, currVal, changed);

            if(changed)
                accuIndex = currIndex;
......
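The hunks above replace the NanPropagation_t-keyed binop_with_nan_check with two policy structs keyed on a plain bool: AccumulateWithNanCheck for plain accumulation and AccumulateWithIndexAndNanCheck for index-tracking accumulation. A hedged sketch of how a reduction inner loop would use them (the function, loop and buffer names are hypothetical; only the AccumulateWith* structs and the ReduceOperation interface come from this commit, and the sketch assumes it is compiled where ck::detail is visible):

template <bool PropagateNan,
          typename ReduceOperation,
          typename AccDataType,
          typename IndexDataType>
__device__ void reduce_slice_with_index(const AccDataType* in,
                                        IndexDataType length,
                                        AccDataType& accuVal,
                                        IndexDataType& accuIndex)
{
    using Accumulate = ck::detail::
        AccumulateWithIndexAndNanCheck<PropagateNan, ReduceOperation, AccDataType, IndexDataType>;

    accuVal   = ReduceOperation::GetReductionZeroVal();
    accuIndex = 0;

    for(IndexDataType i = 0; i < length; ++i)
    {
        // Both the NaN policy and the index tracking are resolved at compile time,
        // so the PropagateNan == false path is just ReduceOperation plus the index update.
        Accumulate::Calculate(accuVal, in[i], accuIndex, i);
    }
}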
......@@ -26,7 +26,7 @@
#ifndef CK_REDUCTION_OPERATOR_HPP
#define CK_REDUCTION_OPERATOR_HPP
-#include "reduction_common.hpp"
+#include "common_header.hpp"
namespace ck {
......@@ -60,11 +60,9 @@ struct Add
{
using dataType = T;
-    __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };
+    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };

-    __device__ inline constexpr void operator()(T& a, T b) const { a = a + b; }
-
-    static constexpr bool indexable = false;
+    __host__ __device__ inline constexpr void operator()(T& a, T b) const { a = a + b; }
};
template <class T>
......@@ -72,11 +70,9 @@ struct Mul
{
using dataType = T;
-    __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(1.0f); };
-
-    __device__ inline constexpr void operator()(T& a, T b) const { a = a * b; }
-
-    static constexpr bool indexable = false;
+    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(1.0f); };
+
+    __host__ __device__ inline constexpr void operator()(T& a, T b) const { a = a * b; }
};
template <class T>
......@@ -84,15 +80,18 @@ struct Max
{
using dataType = T;
-    __device__ static constexpr T GetReductionZeroVal() { return NumericLimits<T>::Lowest(); };
+    __host__ __device__ static constexpr T GetReductionZeroVal()
+    {
+        return NumericLimits<T>::Lowest();
+    };

-    __device__ inline constexpr void operator()(T& a, T b) const
+    __host__ __device__ inline constexpr void operator()(T& a, T b) const
{
if(a < b)
a = b;
}
-    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
+    __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
{
if(a < b)
{
......@@ -100,8 +99,6 @@ struct Max
changed = true;
}
}
-    static constexpr bool indexable = true;
};
template <class T>
......@@ -109,15 +106,18 @@ struct Min
{
using dataType = T;
-    __device__ static constexpr T GetReductionZeroVal() { return NumericLimits<T>::Max(); };
+    __host__ __device__ static constexpr T GetReductionZeroVal()
+    {
+        return NumericLimits<T>::Max();
+    };

-    __device__ inline constexpr void operator()(T& a, T b) const
+    __host__ __device__ inline constexpr void operator()(T& a, T b) const
{
if(a > b)
a = b;
}
-    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
+    __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
{
if(a > b)
{
......@@ -125,8 +125,6 @@ struct Min
changed = true;
}
}
-    static constexpr bool indexable = true;
};
template <class T>
......@@ -134,15 +132,15 @@ struct AMax
{
using dataType = T;
-    __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };
+    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };

-    __device__ inline constexpr void operator()(T& a, T b) const
+    __host__ __device__ inline constexpr void operator()(T& a, T b) const
{
if(a < b)
a = b;
}
-    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
+    __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
{
if(a < b)
{
......@@ -150,270 +148,10 @@ struct AMax
changed = true;
}
}
-    static constexpr bool indexable = true;
};
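Max, Min and AMax keep a second operator() overload that reports through changed whether the accumulator was replaced; that flag is what the index-tracking accumulators above rely on for argmax/argmin, and those three were the operators previously marked indexable = true. A small host-side illustration (the loop is hypothetical; reduce::Max<float> is from this file, and running it on the host is only possible because the operators are now __host__ __device__):

#include <cassert>

// Assumes this translation unit includes reduction_operator.hpp and uses namespace ck.
void argmax_example()
{
    float best      = reduce::Max<float>::GetReductionZeroVal(); // NumericLimits<float>::Lowest()
    int   bestIndex = -1;

    const float values[] = {1.0f, 5.0f, 3.0f, 5.0f};

    for(int i = 0; i < 4; ++i)
    {
        bool changed = false;
        reduce::Max<float>{}(best, values[i], changed); // true only when best is replaced
        if(changed)
            bestIndex = i;
    }

    // Ties keep the first occurrence because 5.0f < 5.0f is false.
    assert(best == 5.0f && bestIndex == 1);
}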
// Unary operators are usually applied element-wise before the reduction is executed on the
// elements.
// They are needed for easy implementation of reduction types of AVG, NRM1, NRM2
template <class T, bool hasDividing>
struct unary_identic
{
__device__ unary_identic(const int divider = 1)
{
scaler = 1.0f / static_cast<float>(divider);
};
__device__ inline constexpr T operator()(T a) const { return a * type_convert<T>(scaler); };
float scaler = 1.0f;
};
template <class T>
struct unary_identic<T, false>
{
__device__ unary_identic(const int divider = 1) { (void)divider; };
__device__ inline constexpr T operator()(T a) const { return a; };
};
template <class T, bool hasDividing>
struct unary_square
{
__device__ unary_square(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };
__device__ inline constexpr T operator()(T a) const
{
a = a * a;
return a * type_convert<T>(scaler);
};
float scaler = 1.0f;
};
template <class T>
struct unary_square<T, false>
{
__device__ unary_square(const int divider = 1) { (void)divider; };
__device__ inline constexpr T operator()(T a) const { return a * a; };
};
template <class T, bool hasDividing>
struct unary_abs
{
__device__ unary_abs(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };
__device__ inline constexpr T operator()(T a) const
{
a = abs(a);
return a * type_convert<T>(scaler);
};
float scaler = 1.0f;
};
template <class T>
struct unary_abs<T, false>
{
__device__ unary_abs(const int divider = 1) { (void)divider; };
__device__ inline constexpr T operator()(T a) const { return abs(a); };
};
// We know for sure that ROCm 4.0 has __habs(), but ROCm 3.0 does not have it.
// Let's assume that __habs() has existed since ROCm 3.5.
#if HIP_PACKAGE_VERSION_FLAT < 3005000000
inline __device__ __half __habs(__half x)
{
union
{
__half half;
unsigned short u16;
} val;
val.half = x;
val.u16 = val.u16 & 0x7fff;
return val.half;
}
#endif
template <bool hasDividing>
struct unary_abs<half_t, hasDividing>
{
__device__ unary_abs(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };
__device__ inline half_t operator()(half_t a) const
{
a = static_cast<half_t>(__habs(a));
return a * type_convert<half_t>(scaler);
};
float scaler = 1.0f;
};
template <>
struct unary_abs<half_t, false>
{
__device__ unary_abs(const int divider = 1) { (void)divider; };
__device__ inline half_t operator()(half_t a) const { return static_cast<half_t>(__habs(a)); };
};
template <class T>
struct unary_sqrt
{
__device__ unary_sqrt(const int divider = 1) { (void)divider; };
__device__ inline T operator()(T a) const { return sqrtf(a); };
};
template <>
struct unary_sqrt<half_t>
{
__device__ unary_sqrt(const int divider = 1) { (void)divider; };
__device__ inline half_t operator()(half_t a) const { return static_cast<half_t>(hsqrt(a)); };
};
}; // end of namespace reduce
// The templated struct reduce_binary_operator maps the enum Ids of binary operators to their
// respective functor classes.
// The "GetReductionZeroVal()" interface and boolean member "indexable" are also provided in
// reduce_binary_operator for easier checking by the upper-layer codes in the kernels.
template <typename T, ReduceTensorOp_t op>
struct reduce_binary_operator;
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::ADD>
{
using opType = reduce::Add<T>;
using dataType = T;
static constexpr bool indexable = reduce::Add<T>::indexable;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MUL>
{
using opType = reduce::Mul<T>;
using dataType = T;
static constexpr bool indexable = reduce::Mul<T>::indexable;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MIN>
{
using opType = reduce::Min<T>;
using dataType = T;
static constexpr bool indexable = reduce::Min<T>::indexable;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MAX>
{
using opType = reduce::Max<T>;
using dataType = T;
static constexpr bool indexable = reduce::Max<T>::indexable;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::AMAX>
{
using opType = reduce::AMax<T>;
using dataType = T;
static constexpr bool indexable = reduce::AMax<T>::indexable;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::AVG>
{
using opType = reduce::Add<T>;
using dataType = T;
static constexpr bool indexable = reduce::Add<T>::indexable;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::NORM1>
{
using opType = reduce::Add<T>;
using dataType = T;
static constexpr bool indexable = reduce::Add<T>::indexable;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::NORM2>
{
using opType = reduce::Add<T>;
using dataType = T;
static constexpr bool indexable = reduce::Add<T>::indexable;
};
// The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary
// functor classes.
// The two unary functors are called before and after the Reduction is executed respectively
template <typename T, ReduceTensorOp_t op, bool isFirstReduce, bool isLastReduce>
struct reduce_unary_operator
{
using preUnaryOp = reduce::unary_identic<T, false>;
using posUnaryOp = reduce::unary_identic<T, false>;
};
template <typename T, bool isFirstReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::AVG, isFirstReduce, true>
{
using preUnaryOp = reduce::unary_identic<T, false>;
using posUnaryOp = reduce::unary_identic<T, true>;
};
template <typename T, bool isLastReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM1, true, isLastReduce>
{
using preUnaryOp = reduce::unary_abs<T, false>;
using posUnaryOp = reduce::unary_identic<T, false>;
};
template <typename T, bool isLastReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::AMAX, true, isLastReduce>
{
using preUnaryOp = reduce::unary_abs<T, false>;
using posUnaryOp = reduce::unary_identic<T, false>;
};
template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, false>
{
using preUnaryOp = reduce::unary_square<T, false>;
using posUnaryOp = reduce::unary_identic<T, false>;
};
template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, true>
{
using preUnaryOp = reduce::unary_square<T, false>;
using posUnaryOp = reduce::unary_sqrt<T>;
};
template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, false, true>
{
using preUnaryOp = reduce::unary_identic<T, false>;
using posUnaryOp = reduce::unary_sqrt<T>;
};
} // end of namespace ck
#endif
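Everything from the unary functors down to the reduce_unary_operator specializations is removed from this header by the -150,270 +148,10 hunk, but it still documents how the pieces compose: reduce_binary_operator selects the accumulation functor for a ReduceTensorOp_t, and reduce_unary_operator selects the element-wise pre/post functors, so NORM2 becomes sqrt(sum(x*x)). A hedged sketch of that composition against the old interfaces shown above (the driver function is hypothetical and assumes it sits inside namespace ck in device code):

template <typename T>
__device__ T norm2(const T* in, int length)
{
    // ReduceTensorOp_t::NORM2 maps to Add for the accumulation...
    using binop = reduce_binary_operator<T, ReduceTensorOp_t::NORM2>;
    // ...and to unary_square / unary_sqrt when this is both the first and the last reduction pass.
    using unops = reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, true>;

    typename unops::preUnaryOp preOp(length); // divider is ignored for NORM2
    typename unops::posUnaryOp posOp(length);

    T acc = binop::opType::GetReductionZeroVal(); // 0 for Add

    for(int i = 0; i < length; ++i)
        binop::opType{}(acc, preOp(in[i])); // acc += x * x

    return posOp(acc); // sqrt(acc)
}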