Commit 9ad9d9cd authored by Lei Wang's avatar Lei Wang Committed by LeiWang1999
Browse files

[Enhancement] Update AtomicAdd functions for BFLOAT16 in common.h (#297)

- Added conditional compilation for BFLOAT16 atomic operations to ensure compatibility with CUDA architectures greater than 7.5.
- Improved code clarity by organizing the AtomicAdd functions and adding relevant comments for better understanding.
parent 5c8de061
......@@ -115,6 +115,8 @@ template <> TL_DEVICE void AtomicAdd(half_t *address, float val) {
atomicAdd(reinterpret_cast<half *>(address), __float2half(val));
}
// AtomicAdd Functions for BFLOAT16
#if (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ > 750))
// AtomicAdd Functions for BFLOAT16
template <> TL_DEVICE void AtomicAdd(bfloat16_t *address, bfloat16_t *val) {
atomicAdd(reinterpret_cast<__nv_bfloat16 *>(address),
......@@ -126,13 +128,15 @@ template <> TL_DEVICE void AtomicAdd(bfloat16_t *address, float val) {
atomicAdd(reinterpret_cast<__nv_bfloat16 *>(address), __float2bfloat16(val));
}
#endif
// AtomicAdd for a packed pair of FP16 values.
// Reinterprets both the destination and the source half_t pair as a
// vectorized half2 and issues a single hardware atomic add, which is
// cheaper than two scalar half atomics.
// NOTE(review): assumes `address` and `val` each point to two contiguous,
// half2-aligned half_t elements — confirm against callers.
TL_DEVICE void AtomicAddx2(half_t *address, half_t *val) {
  // *reinterpret_cast<half2 *>(val) already yields a half2; the original
  // static_cast<half2>(...) around it was a redundant identity cast.
  atomicAdd(reinterpret_cast<half2 *>(address),
            *reinterpret_cast<half2 *>(val));
}
#if (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ >= 750))
#if (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ > 750))
// AtomicAdd Functions for BFLOAT16
template <> TL_DEVICE void AtomicAdd(bfloat16_t *address, bfloat16_t val) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment