#pragma once #define DIV_CELL(a, b) (((a) + (b) - 1) / (b)) #if __cplusplus >= 201703L #define IF_CONSTEXPR constexpr #else #define IF_CONSTEXPR #endif // swz #ifdef __HIP_PLATFORM_HCC__ #include #if defined(__HIPCC_RTC__) #define __HOST_DEVICE__ __device__ #else #define __HOST_DEVICE__ __host__ __device__ // TODO: Clang has a bug which allows device functions to call std functions // when std functions are introduced into default namespace by using statement. // math.h may be included after this bug is fixed. #if __cplusplus #include #else #include "math.h" #endif #endif // !defined(__HIPCC_RTC__) struct hip_bfloat162 { hip_bfloat16 x; hip_bfloat16 y; public: __HOST_DEVICE__ hip_bfloat162() = default; __HOST_DEVICE__ hip_bfloat162(const hip_bfloat16& in1, const hip_bfloat16& in2):x{in1},y{in2} {} __HOST_DEVICE__ hip_bfloat162& operator =(const hip_bfloat162& x) { this->x = hip_bfloat16(float(x.x)); this->y = hip_bfloat16(float(x.y)); return *this; } }; #endif template __device__ __forceinline__ T SHFL_XOR(T value, int laneMask, int width, unsigned int mask = 0xffffffff) { #if CUDA_VERSION >= 9000&& !defined(__HIP_PLATFORM_HCC__) return __shfl_xor_sync(mask, value, laneMask, width); #else return __shfl_xor(value, laneMask, width); #endif } template struct VecTypeImpl; #define DEFINE_VEC_TYPE(t, n, tn) \ template <> \ struct VecTypeImpl { \ using type = tn; \ }; DEFINE_VEC_TYPE(half, 1, half) DEFINE_VEC_TYPE(__nv_bfloat16, 1, __nv_bfloat16) DEFINE_VEC_TYPE(float, 1, float) DEFINE_VEC_TYPE(half, 2, half2) #ifdef __HIP_PLATFORM_HCC__ DEFINE_VEC_TYPE(__nv_bfloat16, 2, hip_bfloat162) #else DEFINE_VEC_TYPE(__nv_bfloat16, 2, __nv_bfloat162) #endif DEFINE_VEC_TYPE(float, 2, float2) DEFINE_VEC_TYPE(half, 4, uint64_t) DEFINE_VEC_TYPE(__nv_bfloat16, 4, uint64_t) DEFINE_VEC_TYPE(float, 4, float4) template using VecType = typename VecTypeImpl::type;