Commit 91e0de2e authored by Chao Liu's avatar Chao Liu
Browse files

adding atomic add intrinsic

parent b8385cca
...@@ -54,6 +54,13 @@ __device__ void __llvm_amdgcn_buffer_storex4(float4_t vdata, ...@@ -54,6 +54,13 @@ __device__ void __llvm_amdgcn_buffer_storex4(float4_t vdata,
bool glc, bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.v4f32"); bool slc) __asm("llvm.amdgcn.buffer.store.v4f32");
__device__ void
__llvm_amdgcn_buffer_atomic_add(float vdata,
int32x4_t rsrc,
index_t vindex,
index_t offset,
bool slc) __asm("llvm.amdgcn.buffer.atomic.fadd.f32");
// buffer_load requires: // buffer_load requires:
// 1) p_src must be in global memory space, d_dst must be vgpr // 1) p_src must be in global memory space, d_dst must be vgpr
// 2) p_src to be a block-invariant pointer. // 2) p_src to be a block-invariant pointer.
...@@ -73,6 +80,13 @@ amd_intrinsic_buffer_store(const typename vector_type<T, VectorSize>::MemoryType ...@@ -73,6 +80,13 @@ amd_intrinsic_buffer_store(const typename vector_type<T, VectorSize>::MemoryType
index_t dst_thread_data_offset, index_t dst_thread_data_offset,
index_t dst_const_data_offset); index_t dst_const_data_offset);
template <typename T, index_t VectorSize>
__device__ void
amd_intrinsic_buffer_atomic_add(const typename vector_type<T, VectorSize>::MemoryType& src,
T* p_dst_block,
index_t dst_thread_data_offset,
index_t dst_const_data_offset);
template <> template <>
__device__ float amd_intrinsic_buffer_load<float, 1>(const float* p_src_block, __device__ float amd_intrinsic_buffer_load<float, 1>(const float* p_src_block,
index_t src_thread_data_offset, index_t src_thread_data_offset,
...@@ -289,5 +303,31 @@ __device__ void amd_intrinsic_buffer_store<float, 4>(const float4_t& src, ...@@ -289,5 +303,31 @@ __device__ void amd_intrinsic_buffer_store<float, 4>(const float4_t& src,
#endif #endif
} }
template <>
__device__ void amd_intrinsic_buffer_atomic_add<float, 1>(const float& src,
float* p_dst_block,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
{
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
BufferLoadStoreDwordConfig<float> dst_block_config;
// fill in byte 0 - 1
dst_block_config.address[0] = p_dst_block;
// fill in byte 2
dst_block_config.range[2] = -1;
// fill in byte 3
dst_block_config.range[3] = 0x00027000;
#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC
__llvm_amdgcn_buffer_atomic_add(
src, dst_block_config.data, 0, dst_thread_addr_offset + dst_const_addr_offset, false);
#else
static_assert(false, " wrong! not implemented");
#endif
}
} // namespace ck } // namespace ck
#endif #endif
...@@ -52,8 +52,13 @@ __device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, in ...@@ -52,8 +52,13 @@ __device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, in
static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == AddressSpace::global>{}( static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == AddressSpace::global>{}(
[&](auto) { [&](auto) {
#if 1
atomicAdd(reinterpret_cast<vector_t*>(&p_dst[dst_offset]), atomicAdd(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
*reinterpret_cast<const vector_t*>(&p_src[src_offset])); *reinterpret_cast<const vector_t*>(&p_src[src_offset]));
#else
amd_intrinsic_buffer_atomic_add<T, DataPerAccess>(
*reinterpret_cast<const vector_t*>(&p_src[src_offset]), p_dst, dst_offset, 0);
#endif
}) })
.Else([&](auto fwd) { .Else([&](auto fwd) {
static_assert(fwd(false), "atomic_add doesn't support this memory space"); static_assert(fwd(false), "atomic_add doesn't support this memory space");
......
...@@ -29,7 +29,7 @@ int main(int argc, char* argv[]) ...@@ -29,7 +29,7 @@ int main(int argc, char* argv[])
constexpr index_t C = 128; constexpr index_t C = 128;
constexpr index_t HI = 35; constexpr index_t HI = 35;
constexpr index_t WI = 35; constexpr index_t WI = 35;
constexpr index_t K = 128; constexpr index_t K = 1024;
constexpr index_t Y = 3; constexpr index_t Y = 3;
constexpr index_t X = 3; constexpr index_t X = 3;
...@@ -246,9 +246,9 @@ int main(int argc, char* argv[]) ...@@ -246,9 +246,9 @@ int main(int argc, char* argv[])
#endif #endif
} }
#if 0 #if 1
device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw
#elif 0 #elif 1
device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw
#elif 0 #elif 0
device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment