Unverified commit 17bbc0ca authored by Lei Wang, committed by GitHub

[Bugfix] Fall back to the old AtomicAdd implementation for legacy architectures (#1306)

parent 2426090f
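The patch below guards the libcu++ cuda::atomic_ref path behind an architecture check (__CUDA_ARCH_LIST__ > 890, i.e. sm_90 and newer) and restores the plain atomicAdd intrinsic on older GPUs. As a point of reference, here is a minimal standalone sketch of that gating idea, not code from the patched header: it checks the per-architecture __CUDA_ARCH__ macro instead of the __CUDA_ARCH_LIST__ macro used in the patch, and the names gated_atomic_add and accumulate are invented for the example.

// Minimal standalone sketch: use libcu++ cuda::atomic_ref where it is wanted
// (sm_90 and newer here) and fall back to the classic atomicAdd intrinsic on
// legacy architectures. Illustrative only; not the header's implementation.
#include <cstdio>
#include <cuda/atomic>

__device__ void gated_atomic_add(float &ref, float val) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 890)
  // Newer parts: route through the libcu++ atomic_ref, which also lets the
  // caller choose a memory order.
  cuda::atomic_ref<float, cuda::thread_scope_device> aref(ref);
  aref.fetch_add(val, cuda::memory_order_relaxed);
#else
  // Legacy fallback, mirroring the behavior this commit restores.
  atomicAdd(&ref, val);
#endif
}

__global__ void accumulate(float *sum, const float *data, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    gated_atomic_add(*sum, data[i]);
}

int main() {
  const int n = 1024;
  float *sum, *data;
  cudaMallocManaged(&sum, sizeof(float));
  cudaMallocManaged(&data, n * sizeof(float));
  *sum = 0.0f;
  for (int i = 0; i < n; ++i)
    data[i] = 1.0f;
  accumulate<<<(n + 255) / 256, 256>>>(sum, data, n);
  cudaDeviceSynchronize();
  printf("sum = %f\n", *sum); // expect 1024
  cudaFree(sum);
  cudaFree(data);
  return 0;
}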
@@ -169,6 +169,7 @@ TL_DEVICE T1 AtomicMinRet(T1 &ref, T2 val,
   }
 }
 
+#if (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ > 890))
 template <typename T1, typename T2>
 TL_DEVICE void AtomicAdd(T1 &ref, T2 val,
                          int memory_order = int(cuda::memory_order_relaxed)) {
@@ -236,14 +237,18 @@ TL_DEVICE void AtomicAdd(T1 &ref, T2 val,
      }
    }
  } else {
-#if CUDART_VERSION >= 11080
-    cuda::atomic_ref<NT1, cuda::thread_scope_device> aref(*address);
-    aref.fetch_add(cuda_cast<NT1>(val), cuda::memory_order(memory_order));
-#else
-    TL_NOT_IMPLEMENTED();
-#endif
+    atomicAdd(reinterpret_cast<NT1 *>(address), cuda_cast<NT1>(val));
  }
 }
+#else
+template <typename T1, typename T2>
+TL_DEVICE void AtomicAdd(T1 &ref, T2 val,
+                         int memory_order = int(cuda::memory_order_relaxed)) {
+  using NT1 = typename normalize_atomic_type<T1>::type;
+  (void)memory_order;
+  atomicAdd(reinterpret_cast<NT1 *>(&ref), cuda_cast<NT1>(val));
+}
+#endif
 
 template <typename T1, typename T2>
 TL_DEVICE T1 AtomicAddRet(T1 &ref, T2 val,
@@ -643,6 +648,48 @@ AtomicAddx4Ret(float *ref, float *val,
     return ret_val;
   }
 }
+#else
+TL_DEVICE void AtomicAddx2(float *ref, float *val,
+                           int memory_order = int(cuda::memory_order_relaxed)) {
+  (void)memory_order;
+  float2 add_val = *reinterpret_cast<float2 *>(val);
+  atomicAdd(ref + 0, add_val.x);
+  atomicAdd(ref + 1, add_val.y);
+}
+
+TL_DEVICE float2
+AtomicAddx2Ret(float *ref, float *val,
+               int memory_order = int(cuda::memory_order_relaxed)) {
+  (void)memory_order;
+  float2 add_val = *reinterpret_cast<float2 *>(val);
+  float2 ret;
+  ret.x = atomicAdd(ref + 0, add_val.x);
+  ret.y = atomicAdd(ref + 1, add_val.y);
+  return ret;
+}
+
+TL_DEVICE void AtomicAddx4(float *ref, float *val,
+                           int memory_order = int(cuda::memory_order_relaxed)) {
+  (void)memory_order;
+  float4 add_val = *reinterpret_cast<float4 *>(val);
+  atomicAdd(ref + 0, add_val.x);
+  atomicAdd(ref + 1, add_val.y);
+  atomicAdd(ref + 2, add_val.z);
+  atomicAdd(ref + 3, add_val.w);
+}
+
+TL_DEVICE float4
+AtomicAddx4Ret(float *ref, float *val,
+               int memory_order = int(cuda::memory_order_relaxed)) {
+  (void)memory_order;
+  float4 add_val = *reinterpret_cast<float4 *>(val);
+  float4 ret;
+  ret.x = atomicAdd(ref + 0, add_val.x);
+  ret.y = atomicAdd(ref + 1, add_val.y);
+  ret.z = atomicAdd(ref + 2, add_val.z);
+  ret.w = atomicAdd(ref + 3, add_val.w);
+  return ret;
+}
 #endif
 
 template <typename T> TL_DEVICE T AtomicLoad(T &ref, int memory_order) {
 ...
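A note on the legacy vector paths added at the end of the diff: on older architectures AtomicAddx2 and AtomicAddx4 decompose into two or four independent 32-bit atomicAdd calls, so each component is updated atomically but the vector as a whole is not a single indivisible operation, and the memory_order argument is ignored. The standalone sketch below reproduces that element-wise behavior outside the header; the names atomic_add_x4_fallback and accumulate_vec4 are invented for illustration.

// Standalone copy of the legacy-path idea: a float4 update decomposed into
// four scalar atomicAdds. Each component is atomic on its own, but concurrent
// readers may observe a partially applied vector.
#include <cstdio>

__device__ void atomic_add_x4_fallback(float *ref, const float4 &v) {
  atomicAdd(ref + 0, v.x);
  atomicAdd(ref + 1, v.y);
  atomicAdd(ref + 2, v.z);
  atomicAdd(ref + 3, v.w);
}

__global__ void accumulate_vec4(float *acc) {
  // Every thread contributes the same vector, so the final result is
  // deterministic even though intermediate states interleave.
  float4 v = make_float4(1.0f, 2.0f, 3.0f, 4.0f);
  atomic_add_x4_fallback(acc, v);
}

int main() {
  float *acc;
  cudaMallocManaged(&acc, 4 * sizeof(float));
  for (int i = 0; i < 4; ++i)
    acc[i] = 0.0f;
  accumulate_vec4<<<4, 64>>>(acc); // 256 threads in total
  cudaDeviceSynchronize();
  printf("%f %f %f %f\n", acc[0], acc[1], acc[2], acc[3]); // 256 512 768 1024
  cudaFree(acc);
  return 0;
}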