/*!
 * Copyright (c) 2019 by Contributors
 * \file array/cuda/atomic.cuh
 * \brief Atomic functions
 */
#ifndef DGL_ARRAY_CUDA_ATOMIC_H_
#define DGL_ARRAY_CUDA_ATOMIC_H_

#include <cuda_runtime.h>

#if __CUDA_ARCH__ >= 600
#include <cuda_fp16.h>  // __half / __half2 types
#endif

namespace dgl {
namespace aten {
namespace cuda {

// Type trait for selecting the unsigned integer "code" type accepted by
// atomicCAS, keyed by the byte width of the value type.
template <int Bytes>
struct Code { };

template <>
struct Code<4> {
  typedef unsigned int Type;
};

template <>
struct Code<8> {
  typedef unsigned long long int Type;
};

// Helper class for converting to/from atomicCAS compatible types.
template <typename T>
struct Cast {
  typedef typename Code<sizeof(T)>::Type Type;
  static __device__ __forceinline__ Type Encode(T val) {
    return static_cast<Type>(val);
  }
  static __device__ __forceinline__ T Decode(Type code) {
    return static_cast<T>(code);
  }
};

template <>
struct Cast<float> {
  typedef Code<sizeof(float)>::Type Type;
  static __device__ __forceinline__ Type Encode(float val) {
    return __float_as_uint(val);
  }
  static __device__ __forceinline__ float Decode(Type code) {
    return __uint_as_float(code);
  }
};

template <>
struct Cast<double> {
  typedef Code<sizeof(double)>::Type Type;
  static __device__ __forceinline__ Type Encode(double val) {
    return __double_as_longlong(val);
  }
  static __device__ __forceinline__ double Decode(Type code) {
    return __longlong_as_double(code);
  }
};

// Defines Atomic##NAME(addr, val) as a compare-and-swap loop: read the
// current value, apply OP, and try to publish the result with atomicCAS,
// retrying until no other thread has modified *addr in between.
// Returns the value stored at *addr before the update.
#define DEFINE_ATOMIC(NAME)                                            \
  template <typename T>                                                \
  __device__ __forceinline__ T Atomic##NAME(T* addr, T val) {          \
    typedef typename Cast<T>::Type CT;                                 \
    CT* addr_as_ui = reinterpret_cast<CT*>(addr);                      \
    CT old = *addr_as_ui;                                              \
    CT assumed = old;                                                  \
    do {                                                               \
      assumed = old;                                                   \
      old = atomicCAS(addr_as_ui, assumed,                             \
          Cast<T>::Encode(OP(val, Cast<T>::Decode(old))));             \
    } while (assumed != old);                                          \
    return Cast<T>::Decode(old);                                       \
  }

#define OP(a, b) max(a, b)
DEFINE_ATOMIC(Max)
#undef OP

#define OP(a, b) min(a, b)
DEFINE_ATOMIC(Min)
#undef OP

#define OP(a, b) a + b
DEFINE_ATOMIC(Add)
#undef OP

// Prefer the hardware atomicAdd instructions where the architecture
// provides them; otherwise the generic atomicCAS loop above is used.
#if __CUDA_ARCH__ >= 200
template <>
__device__ __forceinline__ float AtomicAdd<float>(float* addr, float val) {
  return atomicAdd(addr, val);
}
#endif  // __CUDA_ARCH__

#if __CUDA_ARCH__ >= 600
template <>
__device__ __forceinline__ double AtomicAdd<double>(double* addr, double val) {
  return atomicAdd(addr, val);
}
#endif

#if defined(CUDART_VERSION) && CUDART_VERSION >= 10000
#if __CUDA_ARCH__ >= 600
template <>
__device__ __forceinline__ __half2 AtomicAdd<__half2>(__half2* addr, __half2 val) {
  return atomicAdd(addr, val);
}
#endif  // __CUDA_ARCH__
#if __CUDA_ARCH__ >= 700
template <>
__device__ __forceinline__ __half AtomicAdd<__half>(__half* addr, __half val) {
  return atomicAdd(addr, val);
}
#endif  // __CUDA_ARCH__
#endif

#define OP(a, b) a * b
DEFINE_ATOMIC(Mul)
#undef OP

}  // namespace cuda
}  // namespace aten
}  // namespace dgl

#endif  // DGL_ARRAY_CUDA_ATOMIC_H_
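
// Usage sketch (illustrative only, not part of the original header): how a
// kernel might combine the hardware-accelerated and CAS-based atomics defined
// above. The kernel name and parameters are hypothetical, and the block is
// wrapped in `#if 0` so it is never compiled as part of this header.
#if 0
__global__ void ScatterReduceKernel(const int* index, const float* src,
                                    float* sum_out, float* max_out, int n) {
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < n) {
    const int dst = index[tid];
    // float has a dedicated specialization that maps to hardware atomicAdd.
    dgl::aten::cuda::AtomicAdd(sum_out + dst, src[tid]);
    // There is no hardware atomicMax for float, so this uses the CAS loop.
    dgl::aten::cuda::AtomicMax(max_out + dst, src[tid]);
  }
}
#endif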