Commit ee58ad20 authored by Chao Liu's avatar Chao Liu
Browse files

adding buffer atomic add

parent e38ee30a
...@@ -29,6 +29,11 @@ ...@@ -29,6 +29,11 @@
#define CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC 1 #define CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC 1
#endif #endif
// only support gfx908
#ifndef CK_USE_AMD_BUFFER_ATOMIC_ADD
#define CK_USE_AMD_BUFFER_ATOMIC_ADD 0
#endif
// AMD XDLOPS // AMD XDLOPS
#ifndef CK_USE_AMD_XDLOPS #ifndef CK_USE_AMD_XDLOPS
#define CK_USE_AMD_XDLOPS 0 #define CK_USE_AMD_XDLOPS 0
......
...@@ -52,12 +52,12 @@ __device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, in ...@@ -52,12 +52,12 @@ __device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, in
static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == AddressSpace::global>{}( static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == AddressSpace::global>{}(
[&](auto) { [&](auto) {
#if 1 #if CK_USE_AMD_BUFFER_ATOMIC_ADD
atomicAdd(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
*reinterpret_cast<const vector_t*>(&p_src[src_offset]));
#else
amd_intrinsic_buffer_atomic_add<T, DataPerAccess>( amd_intrinsic_buffer_atomic_add<T, DataPerAccess>(
*reinterpret_cast<const vector_t*>(&p_src[src_offset]), p_dst, dst_offset, 0); *reinterpret_cast<const vector_t*>(&p_src[src_offset]), p_dst, dst_offset, 0);
#else
atomicAdd(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
*reinterpret_cast<const vector_t*>(&p_src[src_offset]));
#endif #endif
}) })
.Else([&](auto fwd) { .Else([&](auto fwd) {
......
...@@ -83,7 +83,7 @@ int main(int argc, char* argv[]) ...@@ -83,7 +83,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<0, 0>; using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>;
#elif 1 #elif 0
// 1x1 filter, 7x7 image // 1x1 filter, 7x7 image
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 1024; constexpr index_t C = 1024;
...@@ -158,7 +158,7 @@ int main(int argc, char* argv[]) ...@@ -158,7 +158,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<2, 2>; using LeftPads = Sequence<2, 2>;
using RightPads = Sequence<2, 2>; using RightPads = Sequence<2, 2>;
#elif 0 #elif 1
// 1x7 filter, 0x3 pad, 17x17 input // 1x7 filter, 0x3 pad, 17x17 input
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 1024; constexpr index_t C = 1024;
...@@ -248,13 +248,13 @@ int main(int argc, char* argv[]) ...@@ -248,13 +248,13 @@ int main(int argc, char* argv[])
#if 0 #if 0
device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw
#elif 1
device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw
#elif 0 #elif 0
device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw
#elif 1
device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw
#elif 0 #elif 0
device_convolution_backward_data_implicit_gemm_v3r1_nchw_kcyx_nkhw device_convolution_backward_data_implicit_gemm_v3r1_nchw_kcyx_nkhw
#else #elif 1
device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw
#endif #endif
(in_nchw_desc, (in_nchw_desc,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment