"...api/git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "827fad66a02745093de94e8a926f74e896833b2a"
Unverified commit 59865bdf, authored by Zhengju Tang, committed by GitHub

[Feature] Add memory_order PTX for vectorized atomic add (#1112)

* [Feature] Add memory_order PTX for vectorized (2x) atomic add

* [Feature] Add memory_order PTX for all vectorized atomic add

* [Lint]

* test

* [BugFix] Fix init optional argument in alloc_var

* bug fix

* bug fix

* lint fix

* lint fix

---------
Co-authored-by: Lei Wang <34334180+LeiWang1999@users.noreply.github.com>
parent 65c4711f
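For orientation before the diff (this sketch is not part of the commit): every helper below takes the requested ordering as an int-encoded cuda::memory_order that defaults to relaxed. A minimal caller might look like the following; the kernel name and indexing are hypothetical, while AtomicAddx2 and half_t come from the file being patched.

#include <cuda/atomic>

// Hypothetical caller: each thread adds a pair of half values into a
// global accumulator and publishes the update with release semantics.
__global__ void accumulate_pairs(half_t *acc, half_t *partial) {
  int i = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
  AtomicAddx2(&acc[i], &partial[i], int(cuda::memory_order_release));
}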
@@ -5,6 +5,7 @@
 #endif
 #include <cuda/atomic>
+#include <cuda_fp16.h>
 #include <cutlass/numeric_types.h>
 using cutlass::bfloat16_t;
@@ -45,8 +46,9 @@ TL_DEVICE void AtomicMax(T1 &ref, T2 val,
                          int memory_order = int(cuda::memory_order_relaxed)) {
   using NT1 = typename normalize_atomic_type<T1>::type;
   T1 *address = &ref;
-  if constexpr (std::is_same_v<NT1, half> ||
-                std::is_same_v<NT1, __nv_bfloat16>) {
+  if constexpr ((std::is_same_v<NT1, half> ||
+                 std::is_same_v<NT1, __nv_bfloat16>) &&
+                memory_order == int(cuda::memory_order_relaxed)) {
     atomicMax(reinterpret_cast<NT1 *>(address), static_cast<NT1>(val));
   } else {
     cuda::atomic_ref<NT1, cuda::thread_scope_device> aref(*address);
@@ -59,8 +61,9 @@ TL_DEVICE T1 AtomicMaxRet(T1 &ref, T2 val,
                           int memory_order = int(cuda::memory_order_relaxed)) {
   using NT1 = typename normalize_atomic_type<T1>::type;
   T1 *address = &ref;
-  if constexpr (std::is_same_v<NT1, half> ||
-                std::is_same_v<NT1, __nv_bfloat16>) {
+  if constexpr ((std::is_same_v<NT1, half> ||
+                 std::is_same_v<NT1, __nv_bfloat16>) &&
+                memory_order == int(cuda::memory_order_relaxed)) {
     return static_cast<T1>(
         atomicMax(reinterpret_cast<NT1 *>(address), static_cast<NT1>(val)));
   } else {
@@ -75,8 +78,9 @@ TL_DEVICE void AtomicMin(T1 &ref, T2 val,
                          int memory_order = int(cuda::memory_order_relaxed)) {
   using NT1 = typename normalize_atomic_type<T1>::type;
   T1 *address = &ref;
-  if constexpr (std::is_same_v<NT1, half> ||
-                std::is_same_v<NT1, __nv_bfloat16>) {
+  if constexpr ((std::is_same_v<NT1, half> ||
+                 std::is_same_v<NT1, __nv_bfloat16>) &&
+                memory_order == int(cuda::memory_order_relaxed)) {
     atomicMin(reinterpret_cast<NT1 *>(address), static_cast<NT1>(val));
   } else {
     cuda::atomic_ref<NT1, cuda::thread_scope_device> aref(*address);
@@ -89,8 +93,9 @@ TL_DEVICE T1 AtomicMinRet(T1 &ref, T2 val,
                           int memory_order = int(cuda::memory_order_relaxed)) {
   using NT1 = typename normalize_atomic_type<T1>::type;
   T1 *address = &ref;
-  if constexpr (std::is_same_v<NT1, half> ||
-                std::is_same_v<NT1, __nv_bfloat16>) {
+  if constexpr ((std::is_same_v<NT1, half> ||
+                 std::is_same_v<NT1, __nv_bfloat16>) &&
+                memory_order == int(cuda::memory_order_relaxed)) {
     return static_cast<T1>(
         atomicMin(reinterpret_cast<NT1 *>(address), static_cast<NT1>(val)));
   } else {
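A caveat on the four hunks above: memory_order is an ordinary runtime parameter, so once the type test is satisfied the new `if constexpr` condition is no longer a constant expression and will not compile. A well-formed split keeps the type dispatch at compile time and the ordering dispatch at run time. A sketch under the diff's own assumptions (including that cuda::atomic_ref handles these 16-bit types and that the elided else branch continues with fetch_max):

template <typename T1, typename T2>
TL_DEVICE void AtomicMax(T1 &ref, T2 val,
                         int memory_order = int(cuda::memory_order_relaxed)) {
  using NT1 = typename normalize_atomic_type<T1>::type;
  T1 *address = &ref;
  if constexpr (std::is_same_v<NT1, half> ||
                std::is_same_v<NT1, __nv_bfloat16>) {
    if (memory_order == int(cuda::memory_order_relaxed)) {
      // Fast path: native 16-bit atomicMax supports only relaxed ordering.
      atomicMax(reinterpret_cast<NT1 *>(address), static_cast<NT1>(val));
      return;
    }
  }
  // General path: atomic_ref honors the requested ordering.
  cuda::atomic_ref<NT1, cuda::thread_scope_device> aref(*address);
  aref.fetch_max(static_cast<NT1>(val), cuda::memory_order(memory_order));
}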
@@ -135,59 +140,321 @@ TL_DEVICE T1 AtomicAddRet(T1 &ref, T2 val,
 // TODO add memory_order for vectorized atomic add
 TL_DEVICE void AtomicAddx2(half_t *ref, half_t *val,
                            int memory_order = int(cuda::memory_order_relaxed)) {
-  atomicAdd(reinterpret_cast<half2 *>(ref),
-            static_cast<half2>(*reinterpret_cast<half2 *>(val)));
+  if (memory_order == int(cuda::memory_order_relaxed)) {
+    atomicAdd(reinterpret_cast<half2 *>(ref),
+              static_cast<half2>(*reinterpret_cast<half2 *>(val)));
+  } else {
+    // atomicAdd takes no memory order and cuda::atomic_ref has no vectorized
+    // operations, so the ordered paths must inline PTX here.
+    // Note: vectorized atomic operations are only supported in the global
+    // space.
+    // Note: 16-bit values are reinterpreted as unsigned short so they can be
+    // bound to "h" (16-bit) registers in the assembly.
+    __half2 add_val = *reinterpret_cast<__half2 *>(val);
+    unsigned short add_val_x_cast =
+        *reinterpret_cast<unsigned short *>(&add_val.x);
+    unsigned short add_val_y_cast =
+        *reinterpret_cast<unsigned short *>(&add_val.y);
+    unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref);
+    __half ret_val_x, ret_val_y;
+    unsigned short ret_val_x_cast =
+        *reinterpret_cast<unsigned short *>(&ret_val_x);
+    unsigned short ret_val_y_cast =
+        *reinterpret_cast<unsigned short *>(&ret_val_y);
+    if (memory_order == int(cuda::memory_order_release) ||
+        memory_order == int(cuda::memory_order_consume)) {
+      asm volatile(
+          "atom.release.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};"
+          : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
+          : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
+          : "memory");
+    } else if (memory_order == int(cuda::memory_order_acquire)) {
+      asm volatile(
+          "atom.acquire.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};"
+          : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
+          : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
+          : "memory");
+    } else if (memory_order == int(cuda::memory_order_acq_rel) ||
+               memory_order == int(cuda::memory_order_seq_cst)) {
+      asm volatile(
+          "atom.acq_rel.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};"
+          : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
+          : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
+          : "memory");
+    }
+  }
 }
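The unsigned short locals above exist only so 16-bit halves can be bound to PTX "h" (16-bit register) operands. cuda_fp16.h also ships bit-cast intrinsics (__half_as_ushort, __ushort_as_half) that express the same packing; a sketch of the acq_rel case as a hypothetical standalone helper:

#include <cuda_fp16.h>

// Hypothetical helper: f16x2 atomic add with acq_rel ordering, using the
// bit-cast intrinsics instead of reinterpret_cast on local copies.
__device__ __half2 atomic_addx2_acq_rel(__half2 *ref, __half2 add_val) {
  unsigned short ax = __half_as_ushort(__low2half(add_val));
  unsigned short ay = __half_as_ushort(__high2half(add_val));
  unsigned short rx, ry; // the PTX writes the previous contents here
  asm volatile(
      "atom.acq_rel.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};"
      : "=h"(rx), "=h"(ry)
      : "l"(ref), "h"(ax), "h"(ay)
      : "memory");
  return __halves2half2(__ushort_as_half(rx), __ushort_as_half(ry));
}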
 
 TL_DEVICE half2
 AtomicAddx2Ret(half_t *ref, half_t *val,
                int memory_order = int(cuda::memory_order_relaxed)) {
-  return atomicAdd(reinterpret_cast<half2 *>(ref),
-                   static_cast<half2>(*reinterpret_cast<half2 *>(val)));
+  if (memory_order == int(cuda::memory_order_relaxed)) {
+    return atomicAdd(reinterpret_cast<half2 *>(ref),
+                     static_cast<half2>(*reinterpret_cast<half2 *>(val)));
+  } else {
+    __half2 add_val = *reinterpret_cast<__half2 *>(val);
+    unsigned short add_val_x_cast =
+        *reinterpret_cast<unsigned short *>(&add_val.x);
+    unsigned short add_val_y_cast =
+        *reinterpret_cast<unsigned short *>(&add_val.y);
+    unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref);
+    __half ret_val_x, ret_val_y;
+    unsigned short ret_val_x_cast =
+        *reinterpret_cast<unsigned short *>(&ret_val_x);
+    unsigned short ret_val_y_cast =
+        *reinterpret_cast<unsigned short *>(&ret_val_y);
+    if (memory_order == int(cuda::memory_order_release) ||
+        memory_order == int(cuda::memory_order_consume)) {
+      asm volatile(
+          "atom.release.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};"
+          : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
+          : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
+          : "memory");
+    } else if (memory_order == int(cuda::memory_order_acquire)) {
+      asm volatile(
+          "atom.acquire.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};"
+          : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
+          : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
+          : "memory");
+    } else if (memory_order == int(cuda::memory_order_acq_rel) ||
+               memory_order == int(cuda::memory_order_seq_cst)) {
+      asm volatile(
+          "atom.acq_rel.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};"
+          : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
+          : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
+          : "memory");
+    }
+    return half2(*reinterpret_cast<__half *>(&ret_val_x_cast),
+                 *reinterpret_cast<__half *>(&ret_val_y_cast));
+  }
 }
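The returning variant makes the old packed value available, which a caller can inspect; a hedged usage sketch (kernel, buffers, and the zero-initialized-accumulator assumption are all hypothetical):

__global__ void add_and_flag(half_t *acc, half_t *delta, int *first) {
  int i = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
  // acq_rel so reading the old value orders against other threads' updates.
  half2 old = AtomicAddx2Ret(&acc[i], &delta[i],
                             int(cuda::memory_order_acq_rel));
  if (__low2float(old) == 0.0f && __high2float(old) == 0.0f)
    first[i / 2] = 1; // this thread touched the (zero-initialized) pair first
}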
 
 #if (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ > 750))
 TL_DEVICE void AtomicAddx2(bfloat16_t *ref, bfloat16_t *val,
                            int memory_order = int(cuda::memory_order_relaxed)) {
-  atomicAdd(
-      reinterpret_cast<__nv_bfloat162 *>(ref),
-      static_cast<__nv_bfloat162>(*reinterpret_cast<__nv_bfloat162 *>(val)));
+  if (memory_order == int(cuda::memory_order_relaxed)) {
+    atomicAdd(
+        reinterpret_cast<__nv_bfloat162 *>(ref),
+        static_cast<__nv_bfloat162>(*reinterpret_cast<__nv_bfloat162 *>(val)));
+  } else {
+    __nv_bfloat162 add_val = *reinterpret_cast<__nv_bfloat162 *>(val);
+    unsigned short add_val_x_cast =
+        *reinterpret_cast<unsigned short *>(&add_val.x);
+    unsigned short add_val_y_cast =
+        *reinterpret_cast<unsigned short *>(&add_val.y);
+    unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref);
+    __nv_bfloat162 ret_val;
+    unsigned short ret_val_x_cast =
+        *reinterpret_cast<unsigned short *>(&ret_val.x);
+    unsigned short ret_val_y_cast =
+        *reinterpret_cast<unsigned short *>(&ret_val.y);
+    if (memory_order == int(cuda::memory_order_release) ||
+        memory_order == int(cuda::memory_order_consume)) {
+      asm volatile(
+          "atom.release.gpu.global.add.noftz.v2.bf16 {%0,%1}, [%2], {%3,%4};"
+          : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
+          : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
+          : "memory");
+    } else if (memory_order == int(cuda::memory_order_acquire)) {
+      asm volatile(
+          "atom.acquire.gpu.global.add.noftz.v2.bf16 {%0,%1}, [%2], {%3,%4};"
+          : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
+          : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
+          : "memory");
+    } else if (memory_order == int(cuda::memory_order_acq_rel) ||
+               memory_order == int(cuda::memory_order_seq_cst)) {
+      asm volatile(
+          "atom.acq_rel.gpu.global.add.noftz.v2.bf16 {%0,%1}, [%2], {%3,%4};"
+          : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
+          : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
+          : "memory");
+    }
+  }
 }
 
 TL_DEVICE __nv_bfloat162
 AtomicAddx2Ret(bfloat16_t *ref, bfloat16_t *val,
                int memory_order = int(cuda::memory_order_relaxed)) {
-  return atomicAdd(
-      reinterpret_cast<__nv_bfloat162 *>(ref),
-      static_cast<__nv_bfloat162>(*reinterpret_cast<__nv_bfloat162 *>(val)));
+  if (memory_order == int(cuda::memory_order_relaxed)) {
+    return atomicAdd(
+        reinterpret_cast<__nv_bfloat162 *>(ref),
+        static_cast<__nv_bfloat162>(*reinterpret_cast<__nv_bfloat162 *>(val)));
+  } else {
+    __nv_bfloat162 add_val = *reinterpret_cast<__nv_bfloat162 *>(val);
+    unsigned short add_val_x_cast =
+        *reinterpret_cast<unsigned short *>(&add_val.x);
+    unsigned short add_val_y_cast =
+        *reinterpret_cast<unsigned short *>(&add_val.y);
+    unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref);
+    __nv_bfloat162 ret_val;
+    unsigned short ret_val_x_cast =
+        *reinterpret_cast<unsigned short *>(&ret_val.x);
+    unsigned short ret_val_y_cast =
+        *reinterpret_cast<unsigned short *>(&ret_val.y);
+    if (memory_order == int(cuda::memory_order_release) ||
+        memory_order == int(cuda::memory_order_consume)) {
+      asm volatile(
+          "atom.release.gpu.global.add.noftz.v2.bf16 {%0,%1}, [%2], {%3,%4};"
+          : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
+          : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
+          : "memory");
+    } else if (memory_order == int(cuda::memory_order_acquire)) {
+      asm volatile(
+          "atom.acquire.gpu.global.add.noftz.v2.bf16 {%0,%1}, [%2], {%3,%4};"
+          : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
+          : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
+          : "memory");
+    } else if (memory_order == int(cuda::memory_order_acq_rel) ||
+               memory_order == int(cuda::memory_order_seq_cst)) {
+      asm volatile(
+          "atom.acq_rel.gpu.global.add.noftz.v2.bf16 {%0,%1}, [%2], {%3,%4};"
+          : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
+          : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
+          : "memory");
+    }
+    return __nv_bfloat162(*reinterpret_cast<__nv_bfloat16 *>(&ret_val_x_cast),
+                          *reinterpret_cast<__nv_bfloat16 *>(&ret_val_y_cast));
+  }
 }
 #endif
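On the guard itself: atomicAdd on __nv_bfloat162 requires compute capability 8.0, hence the > 750 test. __CUDA_ARCH_LIST__ expands to the comma-separated list of all architectures being compiled (a single value only in single-arch builds), while __CUDA_ARCH__ is defined per device compilation pass; a sketch of the conventional per-pass form (TL_HAS_BF16_ATOMICS is a hypothetical macro name):

// Sketch: per-pass guard. __CUDA_ARCH__ takes exactly one value in each
// device pass, so every target architecture is gated independently.
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
#define TL_HAS_BF16_ATOMICS 1
#else
#define TL_HAS_BF16_ATOMICS 0
#endif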
 
 #if (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ >= 900))
 TL_DEVICE void AtomicAddx2(float *ref, float *val,
                            int memory_order = int(cuda::memory_order_relaxed)) {
-  atomicAdd(reinterpret_cast<float2 *>(ref),
-            static_cast<float2>(*reinterpret_cast<float2 *>(val)));
+  if (memory_order == int(cuda::memory_order_relaxed)) {
+    atomicAdd(reinterpret_cast<float2 *>(ref),
+              static_cast<float2>(*reinterpret_cast<float2 *>(val)));
+  } else {
+    float2 add_val = *reinterpret_cast<float2 *>(val);
+    unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref);
+    float2 ret_val;
+    if (memory_order == int(cuda::memory_order_release) ||
+        memory_order == int(cuda::memory_order_consume)) {
+      asm volatile("atom.release.gpu.global.add.v2.f32 {%0,%1}, [%2], {%3,%4};"
+                   : "=f"(ret_val.x), "=f"(ret_val.y)
+                   : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y)
+                   : "memory");
+    } else if (memory_order == int(cuda::memory_order_acquire)) {
+      asm volatile("atom.acquire.gpu.global.add.v2.f32 {%0,%1}, [%2], {%3,%4};"
+                   : "=f"(ret_val.x), "=f"(ret_val.y)
+                   : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y)
+                   : "memory");
+    } else if (memory_order == int(cuda::memory_order_acq_rel) ||
+               memory_order == int(cuda::memory_order_seq_cst)) {
+      asm volatile("atom.acq_rel.gpu.global.add.v2.f32 {%0,%1}, [%2], {%3,%4};"
+                   : "=f"(ret_val.x), "=f"(ret_val.y)
+                   : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y)
+                   : "memory");
+    }
+  }
 }
 
 TL_DEVICE float2
 AtomicAddx2Ret(float *ref, float *val,
                int memory_order = int(cuda::memory_order_relaxed)) {
-  return atomicAdd(reinterpret_cast<float2 *>(ref),
-                   static_cast<float2>(*reinterpret_cast<float2 *>(val)));
+  if (memory_order == int(cuda::memory_order_relaxed)) {
+    return atomicAdd(reinterpret_cast<float2 *>(ref),
+                     static_cast<float2>(*reinterpret_cast<float2 *>(val)));
+  } else {
+    float2 add_val = *reinterpret_cast<float2 *>(val);
+    unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref);
+    float2 ret_val;
+    if (memory_order == int(cuda::memory_order_release) ||
+        memory_order == int(cuda::memory_order_consume)) {
+      asm volatile("atom.release.gpu.global.add.v2.f32 {%0,%1}, [%2], {%3,%4};"
+                   : "=f"(ret_val.x), "=f"(ret_val.y)
+                   : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y)
+                   : "memory");
+    } else if (memory_order == int(cuda::memory_order_acquire)) {
+      asm volatile("atom.acquire.gpu.global.add.v2.f32 {%0,%1}, [%2], {%3,%4};"
+                   : "=f"(ret_val.x), "=f"(ret_val.y)
+                   : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y)
+                   : "memory");
+    } else if (memory_order == int(cuda::memory_order_acq_rel) ||
+               memory_order == int(cuda::memory_order_seq_cst)) {
+      asm volatile("atom.acq_rel.gpu.global.add.v2.f32 {%0,%1}, [%2], {%3,%4};"
+                   : "=f"(ret_val.x), "=f"(ret_val.y)
+                   : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y)
+                   : "memory");
+    }
+    return ret_val;
+  }
 }
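In the void variants the destination registers written by atom are simply discarded. PTX also provides the non-returning red form with the same scope/space qualifiers, restricted to relaxed and release orderings; a sketch of a hypothetical helper for the f32x2 release case:

// Hypothetical helper: non-returning vectorized reduction; 'red' avoids
// binding destination registers for an old value the caller throws away.
__device__ void red_addx2_release(float *ref, float2 add_val) {
  asm volatile("red.release.gpu.global.add.v2.f32 [%0], {%1,%2};"
               :
               : "l"(ref), "f"(add_val.x), "f"(add_val.y)
               : "memory");
}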
 
 TL_DEVICE void AtomicAddx4(float *ref, float *val,
                            int memory_order = int(cuda::memory_order_relaxed)) {
-  atomicAdd(reinterpret_cast<float4 *>(ref),
-            static_cast<float4>(*reinterpret_cast<float4 *>(val)));
+  if (memory_order == int(cuda::memory_order_relaxed)) {
+    atomicAdd(reinterpret_cast<float4 *>(ref),
+              static_cast<float4>(*reinterpret_cast<float4 *>(val)));
+  } else {
+    // atomicAdd takes no memory order and cuda::atomic_ref has no vectorized
+    // operations, so the ordered paths must inline PTX here.
+    // Note: vectorized atomic operations are only supported in the global
+    // space.
+    float4 add_val = *reinterpret_cast<float4 *>(val);
+    unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref);
+    float4 ret_val;
+    if (memory_order == int(cuda::memory_order_release) ||
+        memory_order == int(cuda::memory_order_consume)) {
+      asm volatile("atom.release.gpu.global.add.v4.f32 {%0,%1,%2,%3}, [%4], "
+                   "{%5,%6,%7,%8};"
+                   : "=f"(ret_val.x), "=f"(ret_val.y), "=f"(ret_val.z),
+                     "=f"(ret_val.w)
+                   : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y),
+                     "f"(add_val.z), "f"(add_val.w)
+                   : "memory");
+    } else if (memory_order == int(cuda::memory_order_acquire)) {
+      asm volatile("atom.acquire.gpu.global.add.v4.f32 {%0,%1,%2,%3}, [%4], "
+                   "{%5,%6,%7,%8};"
+                   : "=f"(ret_val.x), "=f"(ret_val.y), "=f"(ret_val.z),
+                     "=f"(ret_val.w)
+                   : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y),
+                     "f"(add_val.z), "f"(add_val.w)
+                   : "memory");
+    } else if (memory_order == int(cuda::memory_order_acq_rel) ||
+               memory_order == int(cuda::memory_order_seq_cst)) {
+      asm volatile("atom.acq_rel.gpu.global.add.v4.f32 {%0,%1,%2,%3}, [%4], "
+                   "{%5,%6,%7,%8};"
+                   : "=f"(ret_val.x), "=f"(ret_val.y), "=f"(ret_val.z),
+                     "=f"(ret_val.w)
+                   : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y),
+                     "f"(add_val.z), "f"(add_val.w)
+                   : "memory");
+    }
+  }
 }
 
 TL_DEVICE float4
 AtomicAddx4Ret(float *ref, float *val,
                int memory_order = int(cuda::memory_order_relaxed)) {
-  return atomicAdd(reinterpret_cast<float4 *>(ref),
-                   static_cast<float4>(*reinterpret_cast<float4 *>(val)));
+  if (memory_order == int(cuda::memory_order_relaxed)) {
+    return atomicAdd(reinterpret_cast<float4 *>(ref),
+                     static_cast<float4>(*reinterpret_cast<float4 *>(val)));
+  } else {
+    float4 add_val = *reinterpret_cast<float4 *>(val);
+    unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref);
+    float4 ret_val;
+    if (memory_order == int(cuda::memory_order_release) ||
+        memory_order == int(cuda::memory_order_consume)) {
+      asm volatile("atom.release.gpu.global.add.v4.f32 {%0,%1,%2,%3}, [%4], "
+                   "{%5,%6,%7,%8};"
+                   : "=f"(ret_val.x), "=f"(ret_val.y), "=f"(ret_val.z),
+                     "=f"(ret_val.w)
+                   : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y),
+                     "f"(add_val.z), "f"(add_val.w)
+                   : "memory");
+    } else if (memory_order == int(cuda::memory_order_acquire)) {
+      asm volatile("atom.acquire.gpu.global.add.v4.f32 {%0,%1,%2,%3}, [%4], "
+                   "{%5,%6,%7,%8};"
+                   : "=f"(ret_val.x), "=f"(ret_val.y), "=f"(ret_val.z),
+                     "=f"(ret_val.w)
+                   : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y),
+                     "f"(add_val.z), "f"(add_val.w)
+                   : "memory");
+    } else if (memory_order == int(cuda::memory_order_acq_rel) ||
+               memory_order == int(cuda::memory_order_seq_cst)) {
+      asm volatile("atom.acq_rel.gpu.global.add.v4.f32 {%0,%1,%2,%3}, [%4], "
+                   "{%5,%6,%7,%8};"
+                   : "=f"(ret_val.x), "=f"(ret_val.y), "=f"(ret_val.z),
+                     "=f"(ret_val.w)
+                   : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y),
+                     "f"(add_val.z), "f"(add_val.w)
+                   : "memory");
+    }
+    return ret_val;
+  }
 }
 #endif
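Finally, a usage sketch for the widest variant (not part of the commit; kernel, buffers, and the publish flag are hypothetical): relaxed adds on the hot path, opting into release only when a consumer will acquire the buffer afterward.

__global__ void accumulate_float4(float *acc, const float *src, int n,
                                  bool publish) {
  int i = 4 * (blockIdx.x * blockDim.x + threadIdx.x);
  if (i + 3 < n) {
    float4 v = *reinterpret_cast<const float4 *>(&src[i]);
    int order = publish ? int(cuda::memory_order_release)
                        : int(cuda::memory_order_relaxed);
    AtomicAddx4(&acc[i], reinterpret_cast<float *>(&v), order);
  }
}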