Commit 9dce6851 authored by Jing Zhang

merge develop

parents 3cc57101 5d37d7bf
@@ -25,21 +25,30 @@ struct MagicDivision
     // uint32_t
     __host__ __device__ static constexpr auto CalculateMagicNumbers(uint32_t divisor)
     {
-        // assert(divisior >= 1 && divisior <= INT32_MAX);
-
-        uint32_t shift = 0;
-        for(shift = 0; shift < 32; ++shift)
-        {
-            if((1U << shift) >= divisor)
-            {
-                break;
-            }
-        }
-
-        uint64_t one        = 1;
-        uint64_t multiplier = ((one << 32) * ((one << shift) - divisor)) / divisor + 1;
-        // assert(multiplier <= 0xffffffffUL);
-
-        return make_tuple(uint32_t(multiplier), shift);
+        // WARNING: magic division is only applicable for divisors inside this range.
+        // Do not use the return value of CalculateMagicNumbers if the divisor is outside
+        // this range. The "else" branch below exists only to silence a run-time error.
+        if(divisor >= 1 && divisor <= INT32_MAX)
+        {
+            uint32_t shift = 0;
+            for(shift = 0; shift < 32; ++shift)
+            {
+                if((1U << shift) >= divisor)
+                {
+                    break;
+                }
+            }
+
+            uint64_t one        = 1;
+            uint64_t multiplier = ((one << 32) * ((one << shift) - divisor)) / divisor + 1;
+            // assert(multiplier <= 0xffffffffUL);
+
+            return make_tuple(uint32_t(multiplier), shift);
+        }
+        else
+        {
+            return make_tuple(uint32_t(0), uint32_t(0));
+        }
     }
 
     __host__ __device__ static constexpr uint32_t CalculateMagicMultiplier(uint32_t divisor)
...
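For context on how such a pair is used: with shift chosen as the smallest s such that 2^s >= divisor, the multiplier computed above is floor(2^32 * (2^s - divisor) / divisor) + 1, and the quotient is recovered as (umulhi(n, multiplier) + n) >> shift. The consuming routine is not part of this hunk; the sketch below is a hypothetical host-side stand-in (MagicDiv and the test loop are illustrative names, not code from this commit), with umulhi emulated by a 64-bit multiply.

#include <cassert>
#include <cstdint>

// Hypothetical host-side stand-in for the GPU-side consumer of the
// (multiplier, shift) pair: q = (umulhi(n, multiplier) + n) >> shift.
// The INT32_MAX range check above guarantees hi < n <= INT32_MAX, so the
// 32-bit addition below cannot overflow.
static uint32_t MagicDiv(uint32_t n, uint32_t multiplier, uint32_t shift)
{
    // Emulate __umulhi(): high 32 bits of the 32x32 -> 64-bit product.
    uint32_t hi = static_cast<uint32_t>((static_cast<uint64_t>(n) * multiplier) >> 32);
    return (hi + n) >> shift;
}

int main()
{
    // divisor = 7: the smallest shift with (1U << shift) >= 7 is 3, and
    // multiplier = (2^32 * (2^3 - 7)) / 7 + 1 = 613566757.
    for(uint32_t n = 0; n < 1000000; ++n)
        assert(MagicDiv(n, 613566757, 3) == n / 7);
}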
#ifndef CK_MATH_V2_HPP
#define CK_MATH_V2_HPP

#include "data_type.hpp"

namespace ck {
namespace math {

static inline __device__ half_t abs(half_t x) { return __habs(x); };
static inline __device__ half_t sqrtf(half_t x) { return hsqrt(x); };
static inline __device__ bool isnan(half_t x) { return __hisnan(x); };

} // namespace math
} // namespace ck

#endif
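These overloads give half_t the same math entry points that float already has, lowering directly to the __habs/hsqrt/__hisnan intrinsics instead of a round-trip through float. A minimal sketch of the calling pattern; normalized_abs is a hypothetical example, not code from this commit:

#include "math_v2.hpp"

// Hypothetical device helper (not part of this commit): half-precision math
// routed through ck::math resolves to __habs()/__hisnan() directly.
__device__ ck::half_t normalized_abs(ck::half_t x, ck::half_t scale)
{
    if(ck::math::isnan(x))
        return x; // leave NaN untouched; callers decide on propagation

    return ck::math::abs(x) / scale;
}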
@@ -48,6 +48,18 @@ struct float_equal_zero
     };
 };
 
+template <index_t N>
+static constexpr __device__ index_t get_shift()
+{
+    return (get_shift<N / 2>() + 1);
+};
+
+template <>
+constexpr __device__ index_t get_shift<1>()
+{
+    return (0);
+}
+
 }; // end of namespace ck
 #endif
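get_shift<N> recursively counts how many times N can be halved before reaching the get_shift<1> base case, i.e. it computes log2(N) at compile time for a power-of-two N (a non-power-of-two quietly truncates: get_shift<6>() == 2). Illustrative checks, assuming the compiler accepts constexpr __device__ functions in constant expressions, as hipcc does; these asserts are not part of the commit:

// Illustrative compile-time checks (not from this commit).
static_assert(ck::get_shift<1>() == 0, "base case");
static_assert(ck::get_shift<2>() == 1, "log2(2) == 1");
static_assert(ck::get_shift<64>() == 6, "log2(64) == 6");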
@@ -34,50 +34,79 @@
 namespace ck {
 namespace detail {
 
-static inline __device__ bool isnan(half_t x) { return __hisnan(x); };
-
-template <NanPropagation_t nanPropaOpt, typename opReduce, typename compType>
-struct binop_with_nan_check;
-
-template <typename opReduce, typename compType>
-struct binop_with_nan_check<NanPropagation_t::NOT_PROPAGATE_NAN, opReduce, compType>
-{
-    // cppcheck-suppress constParameter
-    __device__ static inline void calculate(compType& accuVal, compType currVal)
-    {
-        opReduce{}(accuVal, currVal);
-    };
-
-    // The method is called when the opReduce is indexable and the user asked for indices
-    __device__ static inline void
-    // cppcheck-suppress constParameter
-    calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex)
-    {
-        bool changed = false;
-
-        opReduce{}(accuVal, currVal, changed);
-
-        if(changed)
-            accuIndex = currIndex;
-    };
-};
-
-template <typename opReduce, typename compType>
-struct binop_with_nan_check<NanPropagation_t::PROPAGATE_NAN, opReduce, compType>
-{
-    __device__ static inline void calculate(compType& accuVal, compType currVal)
-    {
-        if(isnan(currVal))
-            accuVal = currVal;
-        else
-            opReduce{}(accuVal, currVal);
-    };
-
-    // The method is called when the opReduce is indexable and the user asked for indices
-    __device__ static inline void
-    calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex)
-    {
-        if(isnan(currVal))
-        {
-            accuVal   = currVal;
-            accuIndex = currIndex;
+template <typename T>
+static inline __device__ bool is_nan(T x)
+{
+    return (isnan(x));
+};
+
+template <>
+inline __device__ bool is_nan<half_t>(half_t x)
+{
+    return (__hisnan(x));
+};
+
+template <bool PropagateNan, typename ReduceOperation, typename AccDataType>
+struct AccumulateWithNanCheck;
+
+template <typename ReduceOperation, typename AccDataType>
+struct AccumulateWithNanCheck<false, ReduceOperation, AccDataType>
+{
+    // cppcheck-suppress constParameter
+    __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal)
+    {
+        ReduceOperation{}(accuVal, currVal);
+    };
+};
+
+template <typename ReduceOperation, typename AccDataType>
+struct AccumulateWithNanCheck<true, ReduceOperation, AccDataType>
+{
+    __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal)
+    {
+        if(is_nan(currVal))
+        {
+            accuVal = currVal;
+        }
+        else
+        {
+            ReduceOperation{}(accuVal, currVal);
+        };
+    };
+};
+
+template <bool PropagateNan, typename ReduceOperation, typename AccDataType, typename IndexDataType>
+struct AccumulateWithIndexAndNanCheck;
+
+template <typename ReduceOperation, typename AccDataType, typename IndexDataType>
+struct AccumulateWithIndexAndNanCheck<false, ReduceOperation, AccDataType, IndexDataType>
+{
+    __device__ static inline void
+    // cppcheck-suppress constParameter
+    Calculate(AccDataType& accuVal,
+              AccDataType currVal,
+              IndexDataType& accuIndex,
+              IndexDataType currIndex)
+    {
+        bool changed = false;
+
+        ReduceOperation{}(accuVal, currVal, changed);
+
+        if(changed)
+            accuIndex = currIndex;
+    };
+};
+
+template <typename ReduceOperation, typename AccDataType, typename IndexDataType>
+struct AccumulateWithIndexAndNanCheck<true, ReduceOperation, AccDataType, IndexDataType>
+{
+    // The method is called when the ReduceOperation is indexable and the user asked for indices
+    __device__ static inline void Calculate(AccDataType& accuVal,
+                                            AccDataType currVal,
+                                            IndexDataType& accuIndex,
+                                            IndexDataType currIndex)
+    {
+        if(is_nan(currVal))
+        {
+            accuVal   = currVal;
+            accuIndex = currIndex;
@@ -86,7 +115,7 @@ struct binop_with_nan_check<NanPropagation_t::PROPAGATE_NAN, opReduce, compType>
         {
             bool changed = false;
 
-            opReduce{}(accuVal, currVal, changed);
+            ReduceOperation{}(accuVal, currVal, changed);
 
             if(changed)
                 accuIndex = currIndex;
...
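The Calculate wrappers are meant to be driven from a per-thread reduction loop; the PropagateNan flag decides whether the first NaN wins or NaNs are simply fed to the binary operation. A sketch of such a loop, assuming reduce::Max from the operator header below; ThreadwiseMax is a hypothetical driver, not this commit's kernel code:

// Hypothetical per-thread fold (not from this commit) over n values.
template <bool PropagateNan, typename AccDataType>
__device__ AccDataType ThreadwiseMax(const AccDataType* vals, int n)
{
    using Op = ck::reduce::Max<AccDataType>;

    AccDataType acc = Op::GetReductionZeroVal(); // NumericLimits<T>::Lowest()

    for(int i = 0; i < n; ++i)
        ck::detail::AccumulateWithNanCheck<PropagateNan, Op, AccDataType>::Calculate(
            acc, vals[i]);

    return acc;
}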
@@ -26,7 +26,7 @@
 #ifndef CK_REDUCTION_OPERATOR_HPP
 #define CK_REDUCTION_OPERATOR_HPP
 
-#include "reduction_common.hpp"
+#include "common_header.hpp"
 
 namespace ck {
@@ -60,11 +60,9 @@ struct Add
 {
     using dataType = T;
 
-    __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };
+    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };
 
-    __device__ inline constexpr void operator()(T& a, T b) const { a = a + b; }
-
-    static constexpr bool indexable = false;
+    __host__ __device__ inline constexpr void operator()(T& a, T b) const { a = a + b; }
 };
 
 template <class T>
@@ -72,11 +70,9 @@ struct Mul
 {
     using dataType = T;
 
-    __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(1.0f); };
+    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(1.0f); };
 
-    __device__ inline constexpr void operator()(T& a, T b) const { a = a * b; }
-
-    static constexpr bool indexable = false;
+    __host__ __device__ inline constexpr void operator()(T& a, T b) const { a = a * b; }
 };
 
 template <class T>
@@ -84,15 +80,18 @@ struct Max
 {
     using dataType = T;
 
-    __device__ static constexpr T GetReductionZeroVal() { return NumericLimits<T>::Lowest(); };
+    __host__ __device__ static constexpr T GetReductionZeroVal()
+    {
+        return NumericLimits<T>::Lowest();
+    };
 
-    __device__ inline constexpr void operator()(T& a, T b) const
+    __host__ __device__ inline constexpr void operator()(T& a, T b) const
     {
         if(a < b)
             a = b;
     }
 
-    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
+    __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
     {
         if(a < b)
         {
@@ -100,8 +99,6 @@ struct Max
             changed = true;
         }
     }
-
-    static constexpr bool indexable = true;
 };
 
 template <class T>
@@ -109,15 +106,18 @@ struct Min
 {
     using dataType = T;
 
-    __device__ static constexpr T GetReductionZeroVal() { return NumericLimits<T>::Max(); };
+    __host__ __device__ static constexpr T GetReductionZeroVal()
+    {
+        return NumericLimits<T>::Max();
+    };
 
-    __device__ inline constexpr void operator()(T& a, T b) const
+    __host__ __device__ inline constexpr void operator()(T& a, T b) const
     {
         if(a > b)
             a = b;
     }
 
-    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
+    __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
     {
         if(a > b)
         {
@@ -125,8 +125,6 @@ struct Min
             changed = true;
         }
     }
-
-    static constexpr bool indexable = true;
 };
 
 template <class T>
@@ -134,15 +132,15 @@ struct AMax
 {
     using dataType = T;
 
-    __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };
+    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };
 
-    __device__ inline constexpr void operator()(T& a, T b) const
+    __host__ __device__ inline constexpr void operator()(T& a, T b) const
     {
         if(a < b)
             a = b;
     }
 
-    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
+    __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
     {
         if(a < b)
         {
@@ -150,270 +148,10 @@ struct AMax
             changed = true;
         }
     }
-
-    static constexpr bool indexable = true;
 };
-
-// Unary operators are usually called element-wisely before the reduction is executed on the
-// elements.
-// They are needed for easy implementation of reduction types of AVG, NRM1, NRM2
-template <class T, bool hasDividing>
-struct unary_identic
-{
-    __device__ unary_identic(const int divider = 1)
-    {
-        scaler = 1.0f / static_cast<float>(divider);
-    };
-
-    __device__ inline constexpr T operator()(T a) const { return a * type_convert<T>(scaler); };
-
-    float scaler = 1.0f;
-};
-
-template <class T>
-struct unary_identic<T, false>
-{
-    __device__ unary_identic(const int divider = 1) { (void)divider; };
-
-    __device__ inline constexpr T operator()(T a) const { return a; };
-};
-
-template <class T, bool hasDividing>
-struct unary_square
-{
-    __device__ unary_square(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };
-
-    __device__ inline constexpr T operator()(T a) const
-    {
-        a = a * a;
-        return a * type_convert<T>(scaler);
-    };
-
-    float scaler = 1.0f;
-};
-
-template <class T>
-struct unary_square<T, false>
-{
-    __device__ unary_square(const int divider = 1) { (void)divider; };
-
-    __device__ inline constexpr T operator()(T a) const { return a * a; };
-};
-
-template <class T, bool hasDividing>
-struct unary_abs
-{
-    __device__ unary_abs(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };
-
-    __device__ inline constexpr T operator()(T a) const
-    {
-        a = abs(a);
-        return a * type_convert<T>(scaler);
-    };
-
-    float scaler = 1.0f;
-};
-
-template <class T>
-struct unary_abs<T, false>
-{
-    __device__ unary_abs(const int divider = 1) { (void)divider; };
-
-    __device__ inline constexpr T operator()(T a) const { return abs(a); };
-};
-
-// We know for sure that 4.0 has __habs(), but 3.0 does not have it.
-// Let's assume that __habs() exists since 3.5.
-#if HIP_PACKAGE_VERSION_FLAT < 3005000000
-inline __device__ __half __habs(__half x)
-{
-    union
-    {
-        __half half;
-        unsigned short u16;
-    } val;
-
-    val.half = x;
-    val.u16  = val.u16 & 0x7fff;
-
-    return val.half;
-}
-#endif
-
-template <bool hasDividing>
-struct unary_abs<half_t, hasDividing>
-{
-    __device__ unary_abs(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };
-
-    __device__ inline half_t operator()(half_t a) const
-    {
-        a = static_cast<half_t>(__habs(a));
-        return a * type_convert<half_t>(scaler);
-    };
-
-    float scaler = 1.0f;
-};
-
-template <>
-struct unary_abs<half_t, false>
-{
-    __device__ unary_abs(const int divider = 1) { (void)divider; };
-
-    __device__ inline half_t operator()(half_t a) const { return static_cast<half_t>(__habs(a)); };
-};
-
-template <class T>
-struct unary_sqrt
-{
-    __device__ unary_sqrt(const int divider = 1) { (void)divider; };
-
-    __device__ inline T operator()(T a) const { return sqrtf(a); };
-};
-
-template <>
-struct unary_sqrt<half_t>
-{
-    __device__ unary_sqrt(const int divider = 1) { (void)divider; };
-
-    __device__ inline half_t operator()(half_t a) const { return static_cast<half_t>(hsqrt(a)); };
-};
 
 }; // end of namespace reduce
 
-// The templated struct reduce_binary_operator maps the enum Ids of binary operators to their
-// respective functor classes.
-// The "GetReductionZeroVal()" interface and boolean member "indexable" are also provided in
-// reduce_binary_operactor for
-// easier checking by the upper-layer codes in the kernels.
-template <typename T, ReduceTensorOp_t op>
-struct reduce_binary_operator;
-
-template <typename T>
-struct reduce_binary_operator<T, ReduceTensorOp_t::ADD>
-{
-    using opType = reduce::Add<T>;
-    using dataType = T;
-
-    static constexpr bool indexable = reduce::Add<T>::indexable;
-};
-
-template <typename T>
-struct reduce_binary_operator<T, ReduceTensorOp_t::MUL>
-{
-    using opType = reduce::Mul<T>;
-    using dataType = T;
-
-    static constexpr bool indexable = reduce::Mul<T>::indexable;
-};
-
-template <typename T>
-struct reduce_binary_operator<T, ReduceTensorOp_t::MIN>
-{
-    using opType = reduce::Min<T>;
-    using dataType = T;
-
-    static constexpr bool indexable = reduce::Min<T>::indexable;
-};
-
-template <typename T>
-struct reduce_binary_operator<T, ReduceTensorOp_t::MAX>
-{
-    using opType = reduce::Max<T>;
-    using dataType = T;
-
-    static constexpr bool indexable = reduce::Max<T>::indexable;
-};
-
-template <typename T>
-struct reduce_binary_operator<T, ReduceTensorOp_t::AMAX>
-{
-    using opType = reduce::AMax<T>;
-    using dataType = T;
-
-    static constexpr bool indexable = reduce::Max<T>::indexable;
-};
-
-template <typename T>
-struct reduce_binary_operator<T, ReduceTensorOp_t::AVG>
-{
-    using opType = reduce::Add<T>;
-    using dataType = T;
-
-    static constexpr bool indexable = reduce::Add<T>::indexable;
-};
-
-template <typename T>
-struct reduce_binary_operator<T, ReduceTensorOp_t::NORM1>
-{
-    using opType = reduce::Add<T>;
-    using dataType = T;
-
-    static constexpr bool indexable = reduce::Add<T>::indexable;
-};
-
-template <typename T>
-struct reduce_binary_operator<T, ReduceTensorOp_t::NORM2>
-{
-    using opType = reduce::Add<T>;
-    using dataType = T;
-
-    static constexpr bool indexable = reduce::Add<T>::indexable;
-};
-
-// The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary
-// functor classes.
-// The two unary functors are called before and afer the Reduction is executed respectively
-template <typename T, ReduceTensorOp_t op, bool isFirsReduce, bool isLastReduce>
-struct reduce_unary_operator
-{
-    using preUnaryOp = reduce::unary_identic<T, false>;
-    using posUnaryOp = reduce::unary_identic<T, false>;
-};
-
-template <typename T, bool isFirstReduce>
-struct reduce_unary_operator<T, ReduceTensorOp_t::AVG, isFirstReduce, true>
-{
-    using preUnaryOp = reduce::unary_identic<T, false>;
-    using posUnaryOp = reduce::unary_identic<T, true>;
-};
-
-template <typename T, bool isLastReduce>
-struct reduce_unary_operator<T, ReduceTensorOp_t::NORM1, true, isLastReduce>
-{
-    using preUnaryOp = reduce::unary_abs<T, false>;
-    using posUnaryOp = reduce::unary_identic<T, false>;
-};
-
-template <typename T, bool isLastReduce>
-struct reduce_unary_operator<T, ReduceTensorOp_t::AMAX, true, isLastReduce>
-{
-    using preUnaryOp = reduce::unary_abs<T, false>;
-    using posUnaryOp = reduce::unary_identic<T, false>;
-};
-
-template <typename T>
-struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, false>
-{
-    using preUnaryOp = reduce::unary_square<T, false>;
-    using posUnaryOp = reduce::unary_identic<T, false>;
-};
-
-template <typename T>
-struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, true>
-{
-    using preUnaryOp = reduce::unary_square<T, false>;
-    using posUnaryOp = reduce::unary_sqrt<T>;
-};
-
-template <typename T>
-struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, false, true>
-{
-    using preUnaryOp = reduce::unary_identic<T, false>;
-    using posUnaryOp = reduce::unary_sqrt<T>;
-};
 
 } // end of namespace ck
 #endif
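With the indexable flags deleted, index-producing reductions lean directly on the ternary operator()(a, b, changed) overloads, and the new __host__ qualifiers make the functors testable on the CPU. A host-side argmax sketch built on reduce::Max; the ArgMax driver is hypothetical, not part of this commit:

#include <cstdint>
#include <utility>

// Hypothetical host-side argmax (not from this commit): 'changed' reports
// whether the accumulator was replaced, the same hook that
// AccumulateWithIndexAndNanCheck uses on the device side.
template <typename T>
std::pair<T, int32_t> ArgMax(const T* vals, int32_t n)
{
    ck::reduce::Max<T> op;

    T       best     = ck::reduce::Max<T>::GetReductionZeroVal();
    int32_t best_idx = 0;

    for(int32_t i = 0; i < n; ++i)
    {
        bool changed = false;
        op(best, vals[i], changed);

        if(changed)
            best_idx = i;
    }

    return {best, best_idx};
}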