Commit 9dce6851 authored by Jing Zhang

merge develop

parents 3cc57101 5d37d7bf
......@@ -25,21 +25,30 @@ struct MagicDivision
// uint32_t
__host__ __device__ static constexpr auto CalculateMagicNumbers(uint32_t divisor)
{
-        // assert(divisior >= 1 && divisior <= INT32_MAX);
-
-        uint32_t shift = 0;
-        for(shift = 0; shift < 32; ++shift)
-        {
-            if((1U << shift) >= divisor)
-            {
-                break;
-            }
-        }
-
-        uint64_t one = 1;
-        uint64_t multiplier = ((one << 32) * ((one << shift) - divisor)) / divisor + 1;
-        // assert(multiplier <= 0xffffffffUL);
-
-        return make_tuple(uint32_t(multiplier), shift);
+        // WARNING: magic division is only applicable for divisors inside this range.
+        // You should not use the return value of CalculateMagicNumbers if the divisor is not
+        // inside this range. The "else" branch below only exists to quiet down a run-time error.
+        if(divisor >= 1 && divisor <= INT32_MAX)
+        {
+            uint32_t shift = 0;
+            for(shift = 0; shift < 32; ++shift)
+            {
+                if((1U << shift) >= divisor)
+                {
+                    break;
+                }
+            }
+
+            uint64_t one = 1;
+            uint64_t multiplier = ((one << 32) * ((one << shift) - divisor)) / divisor + 1;
+            // assert(multiplier <= 0xffffffffUL);
+
+            return make_tuple(uint32_t(multiplier), shift);
+        }
+        else
+        {
+            return make_tuple(uint32_t(0), uint32_t(0));
+        }
}
__host__ __device__ static constexpr uint32_t CalculateMagicMultiplier(uint32_t divisor)
......
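The (multiplier, shift) pair produced by CalculateMagicNumbers lets an integer division be replaced by a multiply-high, an add and a shift, which is much cheaper than a hardware divide in GPU index arithmetic. A minimal host-side sketch of the same construction (the helper names and the main() driver are hypothetical and not part of this commit; the real code is the MagicDivision member above):

#include <cassert>
#include <cstdint>
#include <utility>

// Same construction as CalculateMagicNumbers above: find the smallest shift with
// 2^shift >= divisor, then derive a 32-bit multiplier. Valid for 1 <= divisor <= INT32_MAX.
std::pair<uint32_t, uint32_t> calculate_magic_numbers(uint32_t divisor)
{
    assert(divisor >= 1 && divisor <= 0x7fffffffu);

    uint32_t shift = 0;
    while((1u << shift) < divisor)
        ++shift;

    uint64_t one        = 1;
    uint64_t multiplier = ((one << 32) * ((one << shift) - divisor)) / divisor + 1;
    return {static_cast<uint32_t>(multiplier), shift};
}

// Divide without a hardware divide: take the high 32 bits of the 32x32 product
// (what __umulhi does on the GPU), add the dividend, and shift.
uint32_t magic_divide(uint32_t dividend, uint32_t multiplier, uint32_t shift)
{
    uint64_t hi = (static_cast<uint64_t>(dividend) * multiplier) >> 32;
    return static_cast<uint32_t>((hi + dividend) >> shift);
}

int main()
{
    auto [multiplier, shift] = calculate_magic_numbers(7);
    assert(magic_divide(100, multiplier, shift) == 100 / 7); // 14
    return 0;
}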
#ifndef CK_MATH_V2_HPP
#define CK_MATH_V2_HPP
#include "data_type.hpp"
namespace ck {
namespace math {
static inline __device__ half_t abs(half_t x) { return __habs(x); };
static inline __device__ half_t sqrtf(half_t x) { return hsqrt(x); };
static inline __device__ bool isnan(half_t x) { return __hisnan(x); };
} // namespace math
} // namespace ck
#endif
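The guarded header above adds ck::math wrappers so templated device code can call abs, sqrtf and isnan on half_t and have them resolve to the __half intrinsics. A rough usage sketch (the kernel below is hypothetical and only illustrates the call sites; it assumes half_t converts from float literals the way it does elsewhere in CK):

__global__ void sqrt_abs_or_zero(half_t* data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < n)
    {
        half_t v = data[i];
        // isnan/abs/sqrtf resolve to __hisnan/__habs/hsqrt through the wrappers above
        data[i] = ck::math::isnan(v) ? static_cast<half_t>(0.0f)
                                     : ck::math::sqrtf(ck::math::abs(v));
    }
}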
......@@ -48,6 +48,18 @@ struct float_equal_zero
};
};
template <index_t N>
static constexpr __device__ index_t get_shift()
{
return (get_shift<N / 2>() + 1);
};
template <>
constexpr __device__ index_t get_shift<1>()
{
return (0);
}
}; // end of namespace ck
#endif
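The new get_shift<N> helper computes log2 of a power-of-two N at compile time: each recursion halves N until the N == 1 specialization terminates, so get_shift<8>() expands to get_shift<4>() + 1 and finally to 3. A standalone re-sketch in plain C++ (hypothetical; the real definitions above are __device__ and use ck::index_t, whose 32-bit width is assumed here):

#include <cstdint>

using index_t = int32_t; // assumption: matches ck::index_t's width

template <index_t N>
constexpr index_t get_shift()
{
    // one level of recursion per halving: get_shift<8>() == get_shift<4>() + 1 == 3
    return get_shift<N / 2>() + 1;
}

template <>
constexpr index_t get_shift<1>()
{
    return 0;
}

static_assert(get_shift<1>() == 0, "log2(1) == 0");
static_assert(get_shift<8>() == 3, "log2(8) == 3");
static_assert(get_shift<64>() == 6, "log2(64) == 6");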
......@@ -34,50 +34,79 @@
namespace ck {
namespace detail {
-static inline __device__ bool isnan(half_t x) { return __hisnan(x); };
+template <typename T>
+static inline __device__ bool is_nan(T x)
+{
+    return (isnan(x));
+};
+
+template <>
+inline __device__ bool is_nan<half_t>(half_t x)
+{
+    return (__hisnan(x));
+};

-template <NanPropagation_t nanPropaOpt, typename opReduce, typename compType>
-struct binop_with_nan_check;
+template <bool PropagateNan, typename ReduceOperation, typename AccDataType>
+struct AccumulateWithNanCheck;

-template <typename opReduce, typename compType>
-struct binop_with_nan_check<NanPropagation_t::NOT_PROPAGATE_NAN, opReduce, compType>
-{
-    // cppcheck-suppress constParameter
-    __device__ static inline void calculate(compType& accuVal, compType currVal)
-    {
-        opReduce{}(accuVal, currVal);
-    };
-
-    // The method is called when the opReduce is indexable and the user asked for indices
-    __device__ static inline void
-    // cppcheck-suppress constParameter
-    calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex)
-    {
-        bool changed = false;
-
-        opReduce{}(accuVal, currVal, changed);
-
-        if(changed)
-            accuIndex = currIndex;
-    };
-};
+template <typename ReduceOperation, typename AccDataType>
+struct AccumulateWithNanCheck<false, ReduceOperation, AccDataType>
+{
+    // cppcheck-suppress constParameter
+    __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal)
+    {
+        ReduceOperation{}(accuVal, currVal);
+    };
+};
+
+template <typename ReduceOperation, typename AccDataType>
+struct AccumulateWithNanCheck<true, ReduceOperation, AccDataType>
+{
+    __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal)
+    {
+        if(is_nan(currVal))
+        {
+            accuVal = currVal;
+        }
+        else
+        {
+            ReduceOperation{}(accuVal, currVal);
+        };
+    };
+};
+
+template <bool PropagateNan, typename ReduceOperation, typename AccDataType, typename IndexDataType>
+struct AccumulateWithIndexAndNanCheck;
+
+template <typename ReduceOperation, typename AccDataType, typename IndexDataType>
+struct AccumulateWithIndexAndNanCheck<false, ReduceOperation, AccDataType, IndexDataType>
+{
+    __device__ static inline void
+    // cppcheck-suppress constParameter
+    Calculate(AccDataType& accuVal,
+              AccDataType currVal,
+              IndexDataType& accuIndex,
+              IndexDataType currIndex)
+    {
+        bool changed = false;
+
+        ReduceOperation{}(accuVal, currVal, changed);
+
+        if(changed)
+            accuIndex = currIndex;
+    };
+};

-template <typename opReduce, typename compType>
-struct binop_with_nan_check<NanPropagation_t::PROPAGATE_NAN, opReduce, compType>
-{
-    __device__ static inline void calculate(compType& accuVal, compType currVal)
-    {
-        if(isnan(currVal))
-            accuVal = currVal;
-        else
-            opReduce{}(accuVal, currVal);
-    };
-
-    // The method is called when the opReduce is indexable and the user asked for indices
-    __device__ static inline void
-    calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex)
+template <typename ReduceOperation, typename AccDataType, typename IndexDataType>
+struct AccumulateWithIndexAndNanCheck<true, ReduceOperation, AccDataType, IndexDataType>
+{
+    // The method is called when the ReduceOperation is indexable and the user asked for indices
+    __device__ static inline void Calculate(AccDataType& accuVal,
+                                            AccDataType currVal,
+                                            IndexDataType& accuIndex,
+                                            IndexDataType currIndex)
    {
-        if(isnan(currVal))
+        if(is_nan(currVal))
        {
            accuVal   = currVal;
            accuIndex = currIndex;
......@@ -86,7 +115,7 @@ struct binop_with_nan_check<NanPropagation_t::PROPAGATE_NAN, opReduce, compType>
        {
            bool changed = false;

-            opReduce{}(accuVal, currVal, changed);
+            ReduceOperation{}(accuVal, currVal, changed);

            if(changed)
                accuIndex = currIndex;
......
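The hunks above replace the NanPropagation_t-keyed binop_with_nan_check with two policy structs keyed on a plain bool: AccumulateWithNanCheck for plain accumulation and AccumulateWithIndexAndNanCheck for index-tracking accumulation. A hedged sketch of how a reduction inner loop would use them (the function, loop and buffer names are hypothetical; only the AccumulateWith* structs and the ReduceOperation interface come from this commit, and the sketch assumes it is compiled where ck::detail is visible):

template <bool PropagateNan,
          typename ReduceOperation,
          typename AccDataType,
          typename IndexDataType>
__device__ void reduce_slice_with_index(const AccDataType* in,
                                        IndexDataType length,
                                        AccDataType& accuVal,
                                        IndexDataType& accuIndex)
{
    using Accumulate = ck::detail::
        AccumulateWithIndexAndNanCheck<PropagateNan, ReduceOperation, AccDataType, IndexDataType>;

    accuVal   = ReduceOperation::GetReductionZeroVal();
    accuIndex = 0;

    for(IndexDataType i = 0; i < length; ++i)
    {
        // Both the NaN policy and the index tracking are resolved at compile time,
        // so the PropagateNan == false path is just ReduceOperation plus the index update.
        Accumulate::Calculate(accuVal, in[i], accuIndex, i);
    }
}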
......@@ -26,7 +26,7 @@
#ifndef CK_REDUCTION_OPERATOR_HPP
#define CK_REDUCTION_OPERATOR_HPP
-#include "reduction_common.hpp"
+#include "common_header.hpp"
namespace ck {
......@@ -60,11 +60,9 @@ struct Add
{
using dataType = T;
-    __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };
+    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };

-    __device__ inline constexpr void operator()(T& a, T b) const { a = a + b; }
-
-    static constexpr bool indexable = false;
+    __host__ __device__ inline constexpr void operator()(T& a, T b) const { a = a + b; }
};
template <class T>
......@@ -72,11 +70,9 @@ struct Mul
{
using dataType = T;
-    __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(1.0f); };
-
-    __device__ inline constexpr void operator()(T& a, T b) const { a = a * b; }
-
-    static constexpr bool indexable = false;
+    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(1.0f); };
+
+    __host__ __device__ inline constexpr void operator()(T& a, T b) const { a = a * b; }
};
template <class T>
......@@ -84,15 +80,18 @@ struct Max
{
using dataType = T;
-    __device__ static constexpr T GetReductionZeroVal() { return NumericLimits<T>::Lowest(); };
+    __host__ __device__ static constexpr T GetReductionZeroVal()
+    {
+        return NumericLimits<T>::Lowest();
+    };

-    __device__ inline constexpr void operator()(T& a, T b) const
+    __host__ __device__ inline constexpr void operator()(T& a, T b) const
{
if(a < b)
a = b;
}
-    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
+    __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
{
if(a < b)
{
......@@ -100,8 +99,6 @@ struct Max
changed = true;
}
}
-    static constexpr bool indexable = true;
};
template <class T>
......@@ -109,15 +106,18 @@ struct Min
{
using dataType = T;
-    __device__ static constexpr T GetReductionZeroVal() { return NumericLimits<T>::Max(); };
+    __host__ __device__ static constexpr T GetReductionZeroVal()
+    {
+        return NumericLimits<T>::Max();
+    };

-    __device__ inline constexpr void operator()(T& a, T b) const
+    __host__ __device__ inline constexpr void operator()(T& a, T b) const
{
if(a > b)
a = b;
}
-    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
+    __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
{
if(a > b)
{
......@@ -125,8 +125,6 @@ struct Min
changed = true;
}
}
-    static constexpr bool indexable = true;
};
template <class T>
......@@ -134,15 +132,15 @@ struct AMax
{
using dataType = T;
-    __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };
+    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };

-    __device__ inline constexpr void operator()(T& a, T b) const
+    __host__ __device__ inline constexpr void operator()(T& a, T b) const
{
if(a < b)
a = b;
}
-    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
+    __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
{
if(a < b)
{
......@@ -150,270 +148,10 @@ struct AMax
changed = true;
}
}
-    static constexpr bool indexable = true;
};
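Max, Min and AMax keep a second operator() overload that reports through changed whether the accumulator was replaced; that flag is what the index-tracking accumulators above rely on for argmax/argmin, and those three were the operators previously marked indexable = true. A small host-side illustration (the loop is hypothetical; reduce::Max<float> is from this file, and running it on the host is only possible because the operators are now __host__ __device__):

#include <cassert>

// Assumes this translation unit includes reduction_operator.hpp and uses namespace ck.
void argmax_example()
{
    float best      = reduce::Max<float>::GetReductionZeroVal(); // NumericLimits<float>::Lowest()
    int   bestIndex = -1;

    const float values[] = {1.0f, 5.0f, 3.0f, 5.0f};

    for(int i = 0; i < 4; ++i)
    {
        bool changed = false;
        reduce::Max<float>{}(best, values[i], changed); // true only when best is replaced
        if(changed)
            bestIndex = i;
    }

    // Ties keep the first occurrence because 5.0f < 5.0f is false.
    assert(best == 5.0f && bestIndex == 1);
}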
// Unary operators are usually applied element-wise before the reduction is executed on the
// elements.
// They are needed for easy implementation of reduction types of AVG, NRM1, NRM2
template <class T, bool hasDividing>
struct unary_identic
{
__device__ unary_identic(const int divider = 1)
{
scaler = 1.0f / static_cast<float>(divider);
};
__device__ inline constexpr T operator()(T a) const { return a * type_convert<T>(scaler); };
float scaler = 1.0f;
};
template <class T>
struct unary_identic<T, false>
{
__device__ unary_identic(const int divider = 1) { (void)divider; };
__device__ inline constexpr T operator()(T a) const { return a; };
};
template <class T, bool hasDividing>
struct unary_square
{
__device__ unary_square(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };
__device__ inline constexpr T operator()(T a) const
{
a = a * a;
return a * type_convert<T>(scaler);
};
float scaler = 1.0f;
};
template <class T>
struct unary_square<T, false>
{
__device__ unary_square(const int divider = 1) { (void)divider; };
__device__ inline constexpr T operator()(T a) const { return a * a; };
};
template <class T, bool hasDividing>
struct unary_abs
{
__device__ unary_abs(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };
__device__ inline constexpr T operator()(T a) const
{
a = abs(a);
return a * type_convert<T>(scaler);
};
float scaler = 1.0f;
};
template <class T>
struct unary_abs<T, false>
{
__device__ unary_abs(const int divider = 1) { (void)divider; };
__device__ inline constexpr T operator()(T a) const { return abs(a); };
};
// We know for sure that ROCm 4.0 has __habs(), but ROCm 3.0 does not have it.
// Let's assume that __habs() has existed since ROCm 3.5.
#if HIP_PACKAGE_VERSION_FLAT < 3005000000
inline __device__ __half __habs(__half x)
{
union
{
__half half;
unsigned short u16;
} val;
val.half = x;
val.u16 = val.u16 & 0x7fff;
return val.half;
}
#endif
template <bool hasDividing>
struct unary_abs<half_t, hasDividing>
{
__device__ unary_abs(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };
__device__ inline half_t operator()(half_t a) const
{
a = static_cast<half_t>(__habs(a));
return a * type_convert<half_t>(scaler);
};
float scaler = 1.0f;
};
template <>
struct unary_abs<half_t, false>
{
__device__ unary_abs(const int divider = 1) { (void)divider; };
__device__ inline half_t operator()(half_t a) const { return static_cast<half_t>(__habs(a)); };
};
template <class T>
struct unary_sqrt
{
__device__ unary_sqrt(const int divider = 1) { (void)divider; };
__device__ inline T operator()(T a) const { return sqrtf(a); };
};
template <>
struct unary_sqrt<half_t>
{
__device__ unary_sqrt(const int divider = 1) { (void)divider; };
__device__ inline half_t operator()(half_t a) const { return static_cast<half_t>(hsqrt(a)); };
};
}; // end of namespace reduce
// The templated struct reduce_binary_operator maps the enum Ids of binary operators to their
// respective functor classes.
// The "GetReductionZeroVal()" interface and boolean member "indexable" are also provided in
// reduce_binary_operator for easier checking by the upper-layer codes in the kernels.
template <typename T, ReduceTensorOp_t op>
struct reduce_binary_operator;
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::ADD>
{
using opType = reduce::Add<T>;
using dataType = T;
static constexpr bool indexable = reduce::Add<T>::indexable;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MUL>
{
using opType = reduce::Mul<T>;
using dataType = T;
static constexpr bool indexable = reduce::Mul<T>::indexable;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MIN>
{
using opType = reduce::Min<T>;
using dataType = T;
static constexpr bool indexable = reduce::Min<T>::indexable;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MAX>
{
using opType = reduce::Max<T>;
using dataType = T;
static constexpr bool indexable = reduce::Max<T>::indexable;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::AMAX>
{
using opType = reduce::AMax<T>;
using dataType = T;
static constexpr bool indexable = reduce::AMax<T>::indexable;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::AVG>
{
using opType = reduce::Add<T>;
using dataType = T;
static constexpr bool indexable = reduce::Add<T>::indexable;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::NORM1>
{
using opType = reduce::Add<T>;
using dataType = T;
static constexpr bool indexable = reduce::Add<T>::indexable;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::NORM2>
{
using opType = reduce::Add<T>;
using dataType = T;
static constexpr bool indexable = reduce::Add<T>::indexable;
};
// The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary
// functor classes.
// The two unary functors are called before and after the Reduction is executed respectively
template <typename T, ReduceTensorOp_t op, bool isFirstReduce, bool isLastReduce>
struct reduce_unary_operator
{
using preUnaryOp = reduce::unary_identic<T, false>;
using posUnaryOp = reduce::unary_identic<T, false>;
};
template <typename T, bool isFirstReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::AVG, isFirstReduce, true>
{
using preUnaryOp = reduce::unary_identic<T, false>;
using posUnaryOp = reduce::unary_identic<T, true>;
};
template <typename T, bool isLastReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM1, true, isLastReduce>
{
using preUnaryOp = reduce::unary_abs<T, false>;
using posUnaryOp = reduce::unary_identic<T, false>;
};
template <typename T, bool isLastReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::AMAX, true, isLastReduce>
{
using preUnaryOp = reduce::unary_abs<T, false>;
using posUnaryOp = reduce::unary_identic<T, false>;
};
template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, false>
{
using preUnaryOp = reduce::unary_square<T, false>;
using posUnaryOp = reduce::unary_identic<T, false>;
};
template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, true>
{
using preUnaryOp = reduce::unary_square<T, false>;
using posUnaryOp = reduce::unary_sqrt<T>;
};
template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, false, true>
{
using preUnaryOp = reduce::unary_identic<T, false>;
using posUnaryOp = reduce::unary_sqrt<T>;
};
} // end of namespace ck
#endif
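Everything from the unary functors down to the reduce_unary_operator specializations is removed from this header by the -150,270 +148,10 hunk, but it still documents how the pieces compose: reduce_binary_operator selects the accumulation functor for a ReduceTensorOp_t, and reduce_unary_operator selects the element-wise pre/post functors, so NORM2 becomes sqrt(sum(x*x)). A hedged sketch of that composition against the old interfaces shown above (the driver function is hypothetical and assumes it sits inside namespace ck in device code):

template <typename T>
__device__ T norm2(const T* in, int length)
{
    // ReduceTensorOp_t::NORM2 maps to Add for the accumulation...
    using binop = reduce_binary_operator<T, ReduceTensorOp_t::NORM2>;
    // ...and to unary_square / unary_sqrt when this is both the first and the last reduction pass.
    using unops = reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, true>;

    typename unops::preUnaryOp preOp(length); // divider is ignored for NORM2
    typename unops::posUnaryOp posOp(length);

    T acc = binop::opType::GetReductionZeroVal(); // 0 for Add

    for(int i = 0; i < length; ++i)
        binop::opType{}(acc, preOp(in[i])); // acc += x * x

    return posOp(acc); // sqrt(acc)
}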