Commit 7a3d9697 authored by Chao Liu
Browse files

buffer addressing use offset trick

parent 3b07df08
...@@ -49,12 +49,13 @@ ...@@ -49,12 +49,13 @@
#endif #endif
// experimental implementation // experimental implementation
#ifndef CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
#define CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK 1
#endif
#ifndef CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE
#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1 #define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1
#define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0 #endif
#define CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK #ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK 0 #define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK 0
......
...@@ -50,9 +50,11 @@ struct SetData ...@@ -50,9 +50,11 @@ struct SetData
__device__ void Run(const T* p_src, __device__ void Run(const T* p_src,
index_t src_offset, index_t src_offset,
bool src_valid, bool src_valid,
index_t /* src_range */,
T* p_dst, T* p_dst,
index_t dst_offset, index_t dst_offset,
bool dst_valid) const bool dst_valid,
index_t /* dst_range */) const
{ {
if(dst_valid) if(dst_valid)
{ {
...@@ -77,14 +79,16 @@ struct SetData ...@@ -77,14 +79,16 @@ struct SetData
__device__ void Run<AddressSpace::Global, AddressSpace::Vgpr>(const T* p_src, __device__ void Run<AddressSpace::Global, AddressSpace::Vgpr>(const T* p_src,
index_t src_offset, index_t src_offset,
bool src_valid, bool src_valid,
index_t src_range,
T* p_dst, T* p_dst,
index_t dst_offset, index_t dst_offset,
bool dst_valid) const bool dst_valid,
index_t /* dst_range */) const
{ {
if(dst_valid) if(dst_valid)
{ {
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
amd_buffer_load<T, DataPerAccess>(p_src, src_offset, 0, src_valid); amd_buffer_load<T, DataPerAccess>(p_src, src_offset, 0, src_valid, src_range);
} }
} }
...@@ -96,14 +100,16 @@ struct SetData ...@@ -96,14 +100,16 @@ struct SetData
__device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src, __device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
index_t src_offset, index_t src_offset,
bool src_valid, bool src_valid,
index_t /* src_range */,
T* p_dst, T* p_dst,
index_t dst_offset, index_t dst_offset,
bool dst_valid) const bool dst_valid,
index_t dst_range) const
{ {
const auto zeros = vector_t(0); const auto zeros = vector_t(0);
amd_buffer_store<T, DataPerAccess>( amd_buffer_store<T, DataPerAccess>(
src_valid ? &(p_src[src_offset]) : &zeros, p_dst, dst_offset, 0, dst_valid); src_valid ? &(p_src[src_offset]) : &zeros, p_dst, dst_offset, 0, dst_valid, dst_range);
} }
#endif #endif
}; };
...@@ -118,9 +124,11 @@ struct AtomicAddData ...@@ -118,9 +124,11 @@ struct AtomicAddData
__device__ void Run(const T* p_src, __device__ void Run(const T* p_src,
index_t src_offset, index_t src_offset,
bool src_valid, bool src_valid,
index_t /* src_range */,
T* p_dst, T* p_dst,
index_t dst_offset, index_t dst_offset,
bool dst_valid) const bool dst_valid,
index_t /* dst_range */) const
{ {
if(src_valid && dst_valid) if(src_valid && dst_valid)
{ {
...@@ -137,14 +145,20 @@ struct AtomicAddData ...@@ -137,14 +145,20 @@ struct AtomicAddData
// Specialization of AtomicAddData::Run for a Vgpr source and a Global destination.
//
// Forwards one DataPerAccess-wide access to amd_buffer_atomic_add. When src_valid is
// false a zero vector is passed as the source instead of &p_src[src_offset].
// dst_valid and dst_range are handed to amd_buffer_atomic_add unchanged
// (presumably used there for out-of-bound handling of the destination buffer -
// TODO confirm against the amd_buffer_atomic_add declaration).
//
// Parameters:
//   p_src, src_offset : source pointer and element offset into it
//   src_valid         : when false, a zero vector is used as the source value
//   src_range         : unused by this specialization
//   p_dst, dst_offset : destination pointer and element offset into it
//   dst_valid         : forwarded to amd_buffer_atomic_add
//   dst_range         : forwarded to amd_buffer_atomic_add
//
// The parameter order (offset, valid, range) matches the other Run overloads and the
// argument order used by transfer_data. This fixes the previous declaration, which put
// src_range before src_valid, lacked a comma after src_valid, and passed
// "index_t dst_range" (a declaration, not an expression) as the last call argument.
template <>
__device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
                                                              index_t src_offset,
                                                              bool src_valid,
                                                              index_t /* src_range */,
                                                              T* p_dst,
                                                              index_t dst_offset,
                                                              bool dst_valid,
                                                              index_t dst_range) const
{
    // Substitute source used when the real source element is flagged invalid.
    const auto zeros = vector_t(0);

    amd_buffer_atomic_add<T, DataPerAccess>(src_valid ? &(p_src[src_offset]) : &zeros,
                                            p_dst,
                                            dst_offset,
                                            0,
                                            dst_valid,
                                            dst_range);
}
#endif #endif
}; };
...@@ -159,9 +173,11 @@ template <typename T, ...@@ -159,9 +173,11 @@ template <typename T,
__device__ void transfer_data(const T* p_src, __device__ void transfer_data(const T* p_src,
index_t src_offset, index_t src_offset,
bool src_valid, bool src_valid,
index_t src_range,
T* p_dst, T* p_dst,
index_t dst_offset, index_t dst_offset,
bool dst_valid) bool dst_valid,
index_t dst_range)
{ {
static_assert(DstInMemOp == InMemoryDataOperation::Set || static_assert(DstInMemOp == InMemoryDataOperation::Set ||
DstInMemOp == InMemoryDataOperation::AtomicAdd, DstInMemOp == InMemoryDataOperation::AtomicAdd,
...@@ -173,12 +189,12 @@ __device__ void transfer_data(const T* p_src, ...@@ -173,12 +189,12 @@ __device__ void transfer_data(const T* p_src,
// TODO: use static_if::ElseIf // TODO: use static_if::ElseIf
static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) { static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
SetData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>( SetData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, src_valid, p_dst, dst_offset, dst_valid); p_src, src_offset, src_valid, src_range, p_dst, dst_offset, dst_valid, dst_range);
}); });
static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) { static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
AtomicAddData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>( AtomicAddData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, src_valid, p_dst, dst_offset, dst_valid); p_src, src_offset, src_valid, src_range, p_dst, dst_offset, dst_valid, dst_range);
}); });
} }
else else
...@@ -191,9 +207,11 @@ __device__ void transfer_data(const T* p_src, ...@@ -191,9 +207,11 @@ __device__ void transfer_data(const T* p_src,
p_src, p_src,
src_offset + i * SrcDataStride, src_offset + i * SrcDataStride,
src_valid, src_valid,
src_range,
p_dst, p_dst,
dst_offset + i * DstDataStride, dst_offset + i * DstDataStride,
dst_valid); dst_valid,
dst_range);
}); });
static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) { static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
...@@ -201,9 +219,11 @@ __device__ void transfer_data(const T* p_src, ...@@ -201,9 +219,11 @@ __device__ void transfer_data(const T* p_src,
p_src, p_src,
src_offset + i * SrcDataStride, src_offset + i * SrcDataStride,
src_valid, src_valid,
src_range,
p_dst, p_dst,
dst_offset + i * DstDataStride, dst_offset + i * DstDataStride,
dst_valid); dst_valid,
dst_range);
}); });
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment