"...api/git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "827fad66a02745093de94e8a926f74e896833b2a"
Unverified commit 59865bdf, authored by Zhengju Tang, committed by GitHub

[Feature] Add memory_order PTX for vectorized atomic add (#1112)

* [Feature] Add memory_order PTX for vectorized (2x) atomic add

* [Feature] Add memory_order PTX for all vectorized atomic add

* [Lint]

* test

* [BugFix] Fix init optional argument in alloc_var

* bug fix

* bug fix

* lint fix

* lint fix

---------
Co-authored-by: Lei Wang <34334180+LeiWang1999@users.noreply.github.com>
parent 65c4711f
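For orientation before the diff (this sketch is not part of the commit): every helper below takes the requested ordering as an int-encoded cuda::memory_order that defaults to relaxed. A minimal caller might look like the following; the kernel name and indexing are hypothetical, while AtomicAddx2 and half_t come from the file being patched.

#include <cuda/atomic>

// Hypothetical caller: each thread adds a pair of half values into a
// global accumulator and publishes the update with release semantics.
__global__ void accumulate_pairs(half_t *acc, half_t *partial) {
  int i = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
  AtomicAddx2(&acc[i], &partial[i], int(cuda::memory_order_release));
}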
@@ -5,6 +5,7 @@
 #endif
 #include <cuda/atomic>
+#include <cuda_fp16.h>
 #include <cutlass/numeric_types.h>
 using cutlass::bfloat16_t;
@@ -45,8 +46,9 @@ TL_DEVICE void AtomicMax(T1 &ref, T2 val,
                          int memory_order = int(cuda::memory_order_relaxed)) {
   using NT1 = typename normalize_atomic_type<T1>::type;
   T1 *address = &ref;
-  if constexpr (std::is_same_v<NT1, half> ||
-                std::is_same_v<NT1, __nv_bfloat16>) {
+  if constexpr ((std::is_same_v<NT1, half> ||
+                 std::is_same_v<NT1, __nv_bfloat16>) &&
+                memory_order == int(cuda::memory_order_relaxed)) {
     atomicMax(reinterpret_cast<NT1 *>(address), static_cast<NT1>(val));
   } else {
     cuda::atomic_ref<NT1, cuda::thread_scope_device> aref(*address);
@@ -59,8 +61,9 @@ TL_DEVICE T1 AtomicMaxRet(T1 &ref, T2 val,
                           int memory_order = int(cuda::memory_order_relaxed)) {
   using NT1 = typename normalize_atomic_type<T1>::type;
   T1 *address = &ref;
-  if constexpr (std::is_same_v<NT1, half> ||
-                std::is_same_v<NT1, __nv_bfloat16>) {
+  if constexpr ((std::is_same_v<NT1, half> ||
+                 std::is_same_v<NT1, __nv_bfloat16>) &&
+                memory_order == int(cuda::memory_order_relaxed)) {
     return static_cast<T1>(
         atomicMax(reinterpret_cast<NT1 *>(address), static_cast<NT1>(val)));
   } else {
@@ -75,8 +78,9 @@ TL_DEVICE void AtomicMin(T1 &ref, T2 val,
                          int memory_order = int(cuda::memory_order_relaxed)) {
   using NT1 = typename normalize_atomic_type<T1>::type;
   T1 *address = &ref;
-  if constexpr (std::is_same_v<NT1, half> ||
-                std::is_same_v<NT1, __nv_bfloat16>) {
+  if constexpr ((std::is_same_v<NT1, half> ||
+                 std::is_same_v<NT1, __nv_bfloat16>) &&
+                memory_order == int(cuda::memory_order_relaxed)) {
     atomicMin(reinterpret_cast<NT1 *>(address), static_cast<NT1>(val));
   } else {
     cuda::atomic_ref<NT1, cuda::thread_scope_device> aref(*address);
@@ -89,8 +93,9 @@ TL_DEVICE T1 AtomicMinRet(T1 &ref, T2 val,
                           int memory_order = int(cuda::memory_order_relaxed)) {
   using NT1 = typename normalize_atomic_type<T1>::type;
   T1 *address = &ref;
-  if constexpr (std::is_same_v<NT1, half> ||
-                std::is_same_v<NT1, __nv_bfloat16>) {
+  if constexpr ((std::is_same_v<NT1, half> ||
+                 std::is_same_v<NT1, __nv_bfloat16>) &&
+                memory_order == int(cuda::memory_order_relaxed)) {
     return static_cast<T1>(
         atomicMin(reinterpret_cast<NT1 *>(address), static_cast<NT1>(val)));
   } else {
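A caveat on the four hunks above: memory_order is an ordinary runtime parameter, so once the type test is satisfied the new `if constexpr` condition is no longer a constant expression and will not compile. A well-formed split keeps the type dispatch at compile time and the ordering dispatch at run time. A sketch under the diff's own assumptions (including that cuda::atomic_ref handles these 16-bit types and that the elided else branch continues with fetch_max):

template <typename T1, typename T2>
TL_DEVICE void AtomicMax(T1 &ref, T2 val,
                         int memory_order = int(cuda::memory_order_relaxed)) {
  using NT1 = typename normalize_atomic_type<T1>::type;
  T1 *address = &ref;
  if constexpr (std::is_same_v<NT1, half> ||
                std::is_same_v<NT1, __nv_bfloat16>) {
    if (memory_order == int(cuda::memory_order_relaxed)) {
      // Fast path: native 16-bit atomicMax supports only relaxed ordering.
      atomicMax(reinterpret_cast<NT1 *>(address), static_cast<NT1>(val));
      return;
    }
  }
  // General path: atomic_ref honors the requested ordering.
  cuda::atomic_ref<NT1, cuda::thread_scope_device> aref(*address);
  aref.fetch_max(static_cast<NT1>(val), cuda::memory_order(memory_order));
}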
@@ -135,59 +140,321 @@ TL_DEVICE T1 AtomicAddRet(T1 &ref, T2 val,
 // TODO add memory_order for vectorized atomic add
 TL_DEVICE void AtomicAddx2(half_t *ref, half_t *val,
                            int memory_order = int(cuda::memory_order_relaxed)) {
-  atomicAdd(reinterpret_cast<half2 *>(ref),
-            static_cast<half2>(*reinterpret_cast<half2 *>(val)));
+  if (memory_order == int(cuda::memory_order_relaxed)) {
+    atomicAdd(reinterpret_cast<half2 *>(ref),
+              static_cast<half2>(*reinterpret_cast<half2 *>(val)));
+  } else {
+    // atomicAdd takes no memory order and cuda::atomic_ref has no vectorized
+    // operations, so the ordered paths must inline PTX here.
+    // Note: vectorized atomic operations are only supported in the global
+    // space.
+    // Note: 16-bit values are reinterpreted as unsigned short so they can be
+    // bound to "h" (16-bit) registers in the assembly.
+    __half2 add_val = *reinterpret_cast<__half2 *>(val);
+    unsigned short add_val_x_cast =
+        *reinterpret_cast<unsigned short *>(&add_val.x);
+    unsigned short add_val_y_cast =
+        *reinterpret_cast<unsigned short *>(&add_val.y);
+    unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref);
+    __half ret_val_x, ret_val_y;
+    unsigned short ret_val_x_cast =
+        *reinterpret_cast<unsigned short *>(&ret_val_x);
+    unsigned short ret_val_y_cast =
+        *reinterpret_cast<unsigned short *>(&ret_val_y);
+    if (memory_order == int(cuda::memory_order_release) ||
+        memory_order == int(cuda::memory_order_consume)) {
+      asm volatile(
+          "atom.release.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};"
+          : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
+          : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
+          : "memory");
+    } else if (memory_order == int(cuda::memory_order_acquire)) {
+      asm volatile(
+          "atom.acquire.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};"
+          : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
+          : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
+          : "memory");
+    } else if (memory_order == int(cuda::memory_order_acq_rel) ||
+               memory_order == int(cuda::memory_order_seq_cst)) {
+      asm volatile(
+          "atom.acq_rel.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};"
+          : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
+          : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
+          : "memory");
+    }
+  }
 }
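The unsigned short locals above exist only so 16-bit halves can be bound to PTX "h" (16-bit register) operands. cuda_fp16.h also ships bit-cast intrinsics (__half_as_ushort, __ushort_as_half) that express the same packing; a sketch of the acq_rel case as a hypothetical standalone helper:

#include <cuda_fp16.h>

// Hypothetical helper: f16x2 atomic add with acq_rel ordering, using the
// bit-cast intrinsics instead of reinterpret_cast on local copies.
__device__ __half2 atomic_addx2_acq_rel(__half2 *ref, __half2 add_val) {
  unsigned short ax = __half_as_ushort(__low2half(add_val));
  unsigned short ay = __half_as_ushort(__high2half(add_val));
  unsigned short rx, ry; // the PTX writes the previous contents here
  asm volatile(
      "atom.acq_rel.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};"
      : "=h"(rx), "=h"(ry)
      : "l"(ref), "h"(ax), "h"(ay)
      : "memory");
  return __halves2half2(__ushort_as_half(rx), __ushort_as_half(ry));
}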
 
 TL_DEVICE half2
 AtomicAddx2Ret(half_t *ref, half_t *val,
                int memory_order = int(cuda::memory_order_relaxed)) {
-  return atomicAdd(reinterpret_cast<half2 *>(ref),
-                   static_cast<half2>(*reinterpret_cast<half2 *>(val)));
+  if (memory_order == int(cuda::memory_order_relaxed)) {
+    return atomicAdd(reinterpret_cast<half2 *>(ref),
+                     static_cast<half2>(*reinterpret_cast<half2 *>(val)));
+  } else {
+    __half2 add_val = *reinterpret_cast<__half2 *>(val);
+    unsigned short add_val_x_cast =
+        *reinterpret_cast<unsigned short *>(&add_val.x);
+    unsigned short add_val_y_cast =
+        *reinterpret_cast<unsigned short *>(&add_val.y);
+    unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref);
+    __half ret_val_x, ret_val_y;
+    unsigned short ret_val_x_cast =
+        *reinterpret_cast<unsigned short *>(&ret_val_x);
+    unsigned short ret_val_y_cast =
+        *reinterpret_cast<unsigned short *>(&ret_val_y);
+    if (memory_order == int(cuda::memory_order_release) ||
+        memory_order == int(cuda::memory_order_consume)) {
+      asm volatile(
+          "atom.release.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};"
+          : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
+          : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
+          : "memory");
+    } else if (memory_order == int(cuda::memory_order_acquire)) {
+      asm volatile(
+          "atom.acquire.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};"
+          : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
+          : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
+          : "memory");
+    } else if (memory_order == int(cuda::memory_order_acq_rel) ||
+               memory_order == int(cuda::memory_order_seq_cst)) {
+      asm volatile(
+          "atom.acq_rel.gpu.global.add.noftz.v2.f16 {%0,%1}, [%2], {%3,%4};"
+          : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
+          : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
+          : "memory");
+    }
+    return half2(*reinterpret_cast<__half *>(&ret_val_x_cast),
+                 *reinterpret_cast<__half *>(&ret_val_y_cast));
+  }
 }
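The returning variant makes the old packed value available, which a caller can inspect; a hedged usage sketch (kernel, buffers, and the zero-initialized-accumulator assumption are all hypothetical):

__global__ void add_and_flag(half_t *acc, half_t *delta, int *first) {
  int i = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
  // acq_rel so reading the old value orders against other threads' updates.
  half2 old = AtomicAddx2Ret(&acc[i], &delta[i],
                             int(cuda::memory_order_acq_rel));
  if (__low2float(old) == 0.0f && __high2float(old) == 0.0f)
    first[i / 2] = 1; // this thread touched the (zero-initialized) pair first
}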
 
 #if (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ > 750))
 TL_DEVICE void AtomicAddx2(bfloat16_t *ref, bfloat16_t *val,
                            int memory_order = int(cuda::memory_order_relaxed)) {
-  atomicAdd(
-      reinterpret_cast<__nv_bfloat162 *>(ref),
-      static_cast<__nv_bfloat162>(*reinterpret_cast<__nv_bfloat162 *>(val)));
+  if (memory_order == int(cuda::memory_order_relaxed)) {
+    atomicAdd(
+        reinterpret_cast<__nv_bfloat162 *>(ref),
+        static_cast<__nv_bfloat162>(*reinterpret_cast<__nv_bfloat162 *>(val)));
+  } else {
+    __nv_bfloat162 add_val = *reinterpret_cast<__nv_bfloat162 *>(val);
+    unsigned short add_val_x_cast =
+        *reinterpret_cast<unsigned short *>(&add_val.x);
+    unsigned short add_val_y_cast =
+        *reinterpret_cast<unsigned short *>(&add_val.y);
+    unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref);
+    __nv_bfloat162 ret_val;
+    unsigned short ret_val_x_cast =
+        *reinterpret_cast<unsigned short *>(&ret_val.x);
+    unsigned short ret_val_y_cast =
+        *reinterpret_cast<unsigned short *>(&ret_val.y);
+    if (memory_order == int(cuda::memory_order_release) ||
+        memory_order == int(cuda::memory_order_consume)) {
+      asm volatile(
+          "atom.release.gpu.global.add.noftz.v2.bf16 {%0,%1}, [%2], {%3,%4};"
+          : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
+          : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
+          : "memory");
+    } else if (memory_order == int(cuda::memory_order_acquire)) {
+      asm volatile(
+          "atom.acquire.gpu.global.add.noftz.v2.bf16 {%0,%1}, [%2], {%3,%4};"
+          : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
+          : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
+          : "memory");
+    } else if (memory_order == int(cuda::memory_order_acq_rel) ||
+               memory_order == int(cuda::memory_order_seq_cst)) {
+      asm volatile(
+          "atom.acq_rel.gpu.global.add.noftz.v2.bf16 {%0,%1}, [%2], {%3,%4};"
+          : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
+          : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
+          : "memory");
+    }
+  }
 }
 
 TL_DEVICE __nv_bfloat162
 AtomicAddx2Ret(bfloat16_t *ref, bfloat16_t *val,
                int memory_order = int(cuda::memory_order_relaxed)) {
-  return atomicAdd(
-      reinterpret_cast<__nv_bfloat162 *>(ref),
-      static_cast<__nv_bfloat162>(*reinterpret_cast<__nv_bfloat162 *>(val)));
+  if (memory_order == int(cuda::memory_order_relaxed)) {
+    return atomicAdd(
+        reinterpret_cast<__nv_bfloat162 *>(ref),
+        static_cast<__nv_bfloat162>(*reinterpret_cast<__nv_bfloat162 *>(val)));
+  } else {
+    __nv_bfloat162 add_val = *reinterpret_cast<__nv_bfloat162 *>(val);
+    unsigned short add_val_x_cast =
+        *reinterpret_cast<unsigned short *>(&add_val.x);
+    unsigned short add_val_y_cast =
+        *reinterpret_cast<unsigned short *>(&add_val.y);
+    unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref);
+    __nv_bfloat162 ret_val;
+    unsigned short ret_val_x_cast =
+        *reinterpret_cast<unsigned short *>(&ret_val.x);
+    unsigned short ret_val_y_cast =
+        *reinterpret_cast<unsigned short *>(&ret_val.y);
+    if (memory_order == int(cuda::memory_order_release) ||
+        memory_order == int(cuda::memory_order_consume)) {
+      asm volatile(
+          "atom.release.gpu.global.add.noftz.v2.bf16 {%0,%1}, [%2], {%3,%4};"
+          : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
+          : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
+          : "memory");
+    } else if (memory_order == int(cuda::memory_order_acquire)) {
+      asm volatile(
+          "atom.acquire.gpu.global.add.noftz.v2.bf16 {%0,%1}, [%2], {%3,%4};"
+          : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
+          : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
+          : "memory");
+    } else if (memory_order == int(cuda::memory_order_acq_rel) ||
+               memory_order == int(cuda::memory_order_seq_cst)) {
+      asm volatile(
+          "atom.acq_rel.gpu.global.add.noftz.v2.bf16 {%0,%1}, [%2], {%3,%4};"
+          : "=h"(ret_val_x_cast), "=h"(ret_val_y_cast)
+          : "l"(ref_addr), "h"(add_val_x_cast), "h"(add_val_y_cast)
+          : "memory");
+    }
+    return __nv_bfloat162(*reinterpret_cast<__nv_bfloat16 *>(&ret_val_x_cast),
+                          *reinterpret_cast<__nv_bfloat16 *>(&ret_val_y_cast));
+  }
 }
 #endif
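On the guard itself: atomicAdd on __nv_bfloat162 requires compute capability 8.0, hence the > 750 test. __CUDA_ARCH_LIST__ expands to the comma-separated list of all architectures being compiled (a single value only in single-arch builds), while __CUDA_ARCH__ is defined per device compilation pass; a sketch of the conventional per-pass form (TL_HAS_BF16_ATOMICS is a hypothetical macro name):

// Sketch: per-pass guard. __CUDA_ARCH__ takes exactly one value in each
// device pass, so every target architecture is gated independently.
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
#define TL_HAS_BF16_ATOMICS 1
#else
#define TL_HAS_BF16_ATOMICS 0
#endif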
 
 #if (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ >= 900))
 TL_DEVICE void AtomicAddx2(float *ref, float *val,
                            int memory_order = int(cuda::memory_order_relaxed)) {
-  atomicAdd(reinterpret_cast<float2 *>(ref),
-            static_cast<float2>(*reinterpret_cast<float2 *>(val)));
+  if (memory_order == int(cuda::memory_order_relaxed)) {
+    atomicAdd(reinterpret_cast<float2 *>(ref),
+              static_cast<float2>(*reinterpret_cast<float2 *>(val)));
+  } else {
+    float2 add_val = *reinterpret_cast<float2 *>(val);
+    unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref);
+    float2 ret_val;
+    if (memory_order == int(cuda::memory_order_release) ||
+        memory_order == int(cuda::memory_order_consume)) {
+      asm volatile("atom.release.gpu.global.add.v2.f32 {%0,%1}, [%2], {%3,%4};"
+                   : "=f"(ret_val.x), "=f"(ret_val.y)
+                   : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y)
+                   : "memory");
+    } else if (memory_order == int(cuda::memory_order_acquire)) {
+      asm volatile("atom.acquire.gpu.global.add.v2.f32 {%0,%1}, [%2], {%3,%4};"
+                   : "=f"(ret_val.x), "=f"(ret_val.y)
+                   : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y)
+                   : "memory");
+    } else if (memory_order == int(cuda::memory_order_acq_rel) ||
+               memory_order == int(cuda::memory_order_seq_cst)) {
+      asm volatile("atom.acq_rel.gpu.global.add.v2.f32 {%0,%1}, [%2], {%3,%4};"
+                   : "=f"(ret_val.x), "=f"(ret_val.y)
+                   : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y)
+                   : "memory");
+    }
+  }
 }
 
 TL_DEVICE float2
 AtomicAddx2Ret(float *ref, float *val,
                int memory_order = int(cuda::memory_order_relaxed)) {
-  return atomicAdd(reinterpret_cast<float2 *>(ref),
-                   static_cast<float2>(*reinterpret_cast<float2 *>(val)));
+  if (memory_order == int(cuda::memory_order_relaxed)) {
+    return atomicAdd(reinterpret_cast<float2 *>(ref),
+                     static_cast<float2>(*reinterpret_cast<float2 *>(val)));
+  } else {
+    float2 add_val = *reinterpret_cast<float2 *>(val);
+    unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref);
+    float2 ret_val;
+    if (memory_order == int(cuda::memory_order_release) ||
+        memory_order == int(cuda::memory_order_consume)) {
+      asm volatile("atom.release.gpu.global.add.v2.f32 {%0,%1}, [%2], {%3,%4};"
+                   : "=f"(ret_val.x), "=f"(ret_val.y)
+                   : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y)
+                   : "memory");
+    } else if (memory_order == int(cuda::memory_order_acquire)) {
+      asm volatile("atom.acquire.gpu.global.add.v2.f32 {%0,%1}, [%2], {%3,%4};"
+                   : "=f"(ret_val.x), "=f"(ret_val.y)
+                   : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y)
+                   : "memory");
+    } else if (memory_order == int(cuda::memory_order_acq_rel) ||
+               memory_order == int(cuda::memory_order_seq_cst)) {
+      asm volatile("atom.acq_rel.gpu.global.add.v2.f32 {%0,%1}, [%2], {%3,%4};"
+                   : "=f"(ret_val.x), "=f"(ret_val.y)
+                   : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y)
+                   : "memory");
+    }
+    return ret_val;
+  }
 }
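In the void variants the destination registers written by atom are simply discarded. PTX also provides the non-returning red form with the same scope/space qualifiers, restricted to relaxed and release orderings; a sketch of a hypothetical helper for the f32x2 release case:

// Hypothetical helper: non-returning vectorized reduction; 'red' avoids
// binding destination registers for an old value the caller throws away.
__device__ void red_addx2_release(float *ref, float2 add_val) {
  asm volatile("red.release.gpu.global.add.v2.f32 [%0], {%1,%2};"
               :
               : "l"(ref), "f"(add_val.x), "f"(add_val.y)
               : "memory");
}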
 
 TL_DEVICE void AtomicAddx4(float *ref, float *val,
                            int memory_order = int(cuda::memory_order_relaxed)) {
-  atomicAdd(reinterpret_cast<float4 *>(ref),
-            static_cast<float4>(*reinterpret_cast<float4 *>(val)));
+  if (memory_order == int(cuda::memory_order_relaxed)) {
+    atomicAdd(reinterpret_cast<float4 *>(ref),
+              static_cast<float4>(*reinterpret_cast<float4 *>(val)));
+  } else {
+    // atomicAdd takes no memory order and cuda::atomic_ref has no vectorized
+    // operations, so the ordered paths must inline PTX here.
+    // Note: vectorized atomic operations are only supported in the global
+    // space.
+    float4 add_val = *reinterpret_cast<float4 *>(val);
+    unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref);
+    float4 ret_val;
+    if (memory_order == int(cuda::memory_order_release) ||
+        memory_order == int(cuda::memory_order_consume)) {
+      asm volatile("atom.release.gpu.global.add.v4.f32 {%0,%1,%2,%3}, [%4], "
+                   "{%5,%6,%7,%8};"
+                   : "=f"(ret_val.x), "=f"(ret_val.y), "=f"(ret_val.z),
+                     "=f"(ret_val.w)
+                   : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y),
+                     "f"(add_val.z), "f"(add_val.w)
+                   : "memory");
+    } else if (memory_order == int(cuda::memory_order_acquire)) {
+      asm volatile("atom.acquire.gpu.global.add.v4.f32 {%0,%1,%2,%3}, [%4], "
+                   "{%5,%6,%7,%8};"
+                   : "=f"(ret_val.x), "=f"(ret_val.y), "=f"(ret_val.z),
+                     "=f"(ret_val.w)
+                   : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y),
+                     "f"(add_val.z), "f"(add_val.w)
+                   : "memory");
+    } else if (memory_order == int(cuda::memory_order_acq_rel) ||
+               memory_order == int(cuda::memory_order_seq_cst)) {
+      asm volatile("atom.acq_rel.gpu.global.add.v4.f32 {%0,%1,%2,%3}, [%4], "
+                   "{%5,%6,%7,%8};"
+                   : "=f"(ret_val.x), "=f"(ret_val.y), "=f"(ret_val.z),
+                     "=f"(ret_val.w)
+                   : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y),
+                     "f"(add_val.z), "f"(add_val.w)
+                   : "memory");
+    }
+  }
 }
 
 TL_DEVICE float4
 AtomicAddx4Ret(float *ref, float *val,
                int memory_order = int(cuda::memory_order_relaxed)) {
-  return atomicAdd(reinterpret_cast<float4 *>(ref),
-                   static_cast<float4>(*reinterpret_cast<float4 *>(val)));
+  if (memory_order == int(cuda::memory_order_relaxed)) {
+    return atomicAdd(reinterpret_cast<float4 *>(ref),
+                     static_cast<float4>(*reinterpret_cast<float4 *>(val)));
+  } else {
+    float4 add_val = *reinterpret_cast<float4 *>(val);
+    unsigned long long ref_addr = reinterpret_cast<unsigned long long>(ref);
+    float4 ret_val;
+    if (memory_order == int(cuda::memory_order_release) ||
+        memory_order == int(cuda::memory_order_consume)) {
+      asm volatile("atom.release.gpu.global.add.v4.f32 {%0,%1,%2,%3}, [%4], "
+                   "{%5,%6,%7,%8};"
+                   : "=f"(ret_val.x), "=f"(ret_val.y), "=f"(ret_val.z),
+                     "=f"(ret_val.w)
+                   : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y),
+                     "f"(add_val.z), "f"(add_val.w)
+                   : "memory");
+    } else if (memory_order == int(cuda::memory_order_acquire)) {
+      asm volatile("atom.acquire.gpu.global.add.v4.f32 {%0,%1,%2,%3}, [%4], "
+                   "{%5,%6,%7,%8};"
+                   : "=f"(ret_val.x), "=f"(ret_val.y), "=f"(ret_val.z),
+                     "=f"(ret_val.w)
+                   : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y),
+                     "f"(add_val.z), "f"(add_val.w)
+                   : "memory");
+    } else if (memory_order == int(cuda::memory_order_acq_rel) ||
+               memory_order == int(cuda::memory_order_seq_cst)) {
+      asm volatile("atom.acq_rel.gpu.global.add.v4.f32 {%0,%1,%2,%3}, [%4], "
+                   "{%5,%6,%7,%8};"
+                   : "=f"(ret_val.x), "=f"(ret_val.y), "=f"(ret_val.z),
+                     "=f"(ret_val.w)
+                   : "l"(ref_addr), "f"(add_val.x), "f"(add_val.y),
+                     "f"(add_val.z), "f"(add_val.w)
+                   : "memory");
+    }
+    return ret_val;
+  }
 }
 #endif
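Finally, a usage sketch for the widest variant (not part of the commit; kernel, buffers, and the publish flag are hypothetical): relaxed adds on the hot path, opting into release only when a consumer will acquire the buffer afterward.

__global__ void accumulate_float4(float *acc, const float *src, int n,
                                  bool publish) {
  int i = 4 * (blockIdx.x * blockDim.x + threadIdx.x);
  if (i + 3 < n) {
    float4 v = *reinterpret_cast<const float4 *>(&src[i]);
    int order = publish ? int(cuda::memory_order_release)
                        : int(cuda::memory_order_relaxed);
    AtomicAddx4(&acc[i], reinterpret_cast<float *>(&v), order);
  }
}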