Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
7a3d9697
Commit
7a3d9697
authored
Jun 28, 2020
by
Chao Liu
Browse files
buffer addressing use offset trick
parent
3b07df08
Changes
3
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
734 additions
and
411 deletions
+734
-411
composable_kernel/include/utility/amd_buffer_addressing.hpp
composable_kernel/include/utility/amd_buffer_addressing.hpp
+694
-392
composable_kernel/include/utility/config.amd.hpp.in
composable_kernel/include/utility/config.amd.hpp.in
+6
-5
composable_kernel/include/utility/in_memory_operation.amd.hpp.in
composable_kernel/include/utility/in_memory_operation.amd.hpp.in
+34
-14
No files found.
composable_kernel/include/utility/amd_buffer_addressing.hpp
View file @
7a3d9697
This diff is collapsed.
Click to expand it.
composable_kernel/include/utility/config.amd.hpp.in
View file @
7a3d9697
...
@@ -49,12 +49,13 @@
...
@@ -49,12 +49,13 @@
#endif
#endif
// experimental implementation
// experimental implementation
#ifndef CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
#define CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK 1
#endif
#ifndef CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE
#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1
#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1
#define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0
#endif
#define CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK
#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK 0
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK 0
...
...
composable_kernel/include/utility/in_memory_operation.amd.hpp.in
View file @
7a3d9697
...
@@ -50,9 +50,11 @@ struct SetData
...
@@ -50,9 +50,11 @@ struct SetData
__device__ void Run(const T* p_src,
__device__ void Run(const T* p_src,
index_t src_offset,
index_t src_offset,
bool src_valid,
bool src_valid,
index_t /* src_range */,
T* p_dst,
T* p_dst,
index_t dst_offset,
index_t dst_offset,
bool dst_valid) const
bool dst_valid,
index_t /* dst_range */) const
{
{
if(dst_valid)
if(dst_valid)
{
{
...
@@ -77,14 +79,16 @@ struct SetData
...
@@ -77,14 +79,16 @@ struct SetData
__device__ void Run<AddressSpace::Global, AddressSpace::Vgpr>(const T* p_src,
__device__ void Run<AddressSpace::Global, AddressSpace::Vgpr>(const T* p_src,
index_t src_offset,
index_t src_offset,
bool src_valid,
bool src_valid,
index_t src_range,
T* p_dst,
T* p_dst,
index_t dst_offset,
index_t dst_offset,
bool dst_valid) const
bool dst_valid,
index_t /* dst_range */) const
{
{
if(dst_valid)
if(dst_valid)
{
{
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
amd_buffer_load<T, DataPerAccess>(p_src, src_offset, 0, src_valid);
amd_buffer_load<T, DataPerAccess>(p_src, src_offset, 0, src_valid
, src_range
);
}
}
}
}
...
@@ -96,14 +100,16 @@ struct SetData
...
@@ -96,14 +100,16 @@ struct SetData
__device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
__device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
index_t src_offset,
index_t src_offset,
bool src_valid,
bool src_valid,
index_t /* src_range */,
T* p_dst,
T* p_dst,
index_t dst_offset,
index_t dst_offset,
bool dst_valid) const
bool dst_valid,
index_t dst_range) const
{
{
const auto zeros = vector_t(0);
const auto zeros = vector_t(0);
amd_buffer_store<T, DataPerAccess>(
amd_buffer_store<T, DataPerAccess>(
src_valid ? &(p_src[src_offset]) : &zeros, p_dst, dst_offset, 0, dst_valid);
src_valid ? &(p_src[src_offset]) : &zeros, p_dst, dst_offset, 0, dst_valid
, dst_range
);
}
}
#endif
#endif
};
};
...
@@ -118,9 +124,11 @@ struct AtomicAddData
...
@@ -118,9 +124,11 @@ struct AtomicAddData
__device__ void Run(const T* p_src,
__device__ void Run(const T* p_src,
index_t src_offset,
index_t src_offset,
bool src_valid,
bool src_valid,
index_t /* src_range */,
T* p_dst,
T* p_dst,
index_t dst_offset,
index_t dst_offset,
bool dst_valid) const
bool dst_valid,
index_t /* dst_range */) const
{
{
if(src_valid && dst_valid)
if(src_valid && dst_valid)
{
{
...
@@ -137,14 +145,20 @@ struct AtomicAddData
...
@@ -137,14 +145,20 @@ struct AtomicAddData
// Explicit specialization of AtomicAddData::Run for a Vgpr source and a Global
// destination: adds DataPerAccess elements into p_dst via amd_buffer_atomic_add.
//
// The parameter order (offset, valid, range for each side) must match the
// primary Run template and the call made from transfer_data(); the previous
// text declared src_range before src_valid, dropped the comma after src_valid,
// and passed "index_t dst_range" (a declaration) as a call argument.
//
//   p_src, src_offset : source base pointer and element offset
//   src_valid         : when false, a zero vector is added instead of source data
//   src_range         : unused by this specialization
//   p_dst, dst_offset : destination base pointer and element offset
//   dst_valid         : forwarded to amd_buffer_atomic_add
//   dst_range         : forwarded to amd_buffer_atomic_add
template <>
__device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
                                                              index_t src_offset,
                                                              bool src_valid,
                                                              index_t /* src_range */,
                                                              T* p_dst,
                                                              index_t dst_offset,
                                                              bool dst_valid,
                                                              index_t dst_range) const
{
    // Fallback operand used when the source element is flagged invalid.
    // NOTE(review): the conditional below mixes &p_src[...] with &zeros; this
    // presumably relies on vector_t being compatible with T here - confirm
    // against the vector_t definition, which is not visible in this hunk.
    const auto zeros = vector_t(0);

    amd_buffer_atomic_add<T, DataPerAccess>(src_valid ? &(p_src[src_offset]) : &zeros,
                                            p_dst,
                                            dst_offset,
                                            0,
                                            dst_valid,
                                            dst_range);
}
#endif
#endif
};
};
...
@@ -159,9 +173,11 @@ template <typename T,
...
@@ -159,9 +173,11 @@ template <typename T,
__device__ void transfer_data(const T* p_src,
__device__ void transfer_data(const T* p_src,
index_t src_offset,
index_t src_offset,
bool src_valid,
bool src_valid,
index_t src_range,
T* p_dst,
T* p_dst,
index_t dst_offset,
index_t dst_offset,
bool dst_valid)
bool dst_valid,
index_t dst_range)
{
{
static_assert(DstInMemOp == InMemoryDataOperation::Set ||
static_assert(DstInMemOp == InMemoryDataOperation::Set ||
DstInMemOp == InMemoryDataOperation::AtomicAdd,
DstInMemOp == InMemoryDataOperation::AtomicAdd,
...
@@ -173,12 +189,12 @@ __device__ void transfer_data(const T* p_src,
...
@@ -173,12 +189,12 @@ __device__ void transfer_data(const T* p_src,
// TODO: use static_if::ElseIf
// TODO: use static_if::ElseIf
static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
SetData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
SetData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, src_valid, p_dst, dst_offset, dst_valid);
p_src, src_offset, src_valid,
src_range,
p_dst, dst_offset, dst_valid
, dst_range
);
});
});
static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
AtomicAddData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
AtomicAddData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, src_valid, p_dst, dst_offset, dst_valid);
p_src, src_offset, src_valid,
src_range,
p_dst, dst_offset, dst_valid
, dst_range
);
});
});
}
}
else
else
...
@@ -191,9 +207,11 @@ __device__ void transfer_data(const T* p_src,
...
@@ -191,9 +207,11 @@ __device__ void transfer_data(const T* p_src,
p_src,
p_src,
src_offset + i * SrcDataStride,
src_offset + i * SrcDataStride,
src_valid,
src_valid,
src_range,
p_dst,
p_dst,
dst_offset + i * DstDataStride,
dst_offset + i * DstDataStride,
dst_valid);
dst_valid,
dst_range);
});
});
static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
...
@@ -201,9 +219,11 @@ __device__ void transfer_data(const T* p_src,
...
@@ -201,9 +219,11 @@ __device__ void transfer_data(const T* p_src,
p_src,
p_src,
src_offset + i * SrcDataStride,
src_offset + i * SrcDataStride,
src_valid,
src_valid,
src_range,
p_dst,
p_dst,
dst_offset + i * DstDataStride,
dst_offset + i * DstDataStride,
dst_valid);
dst_valid,
dst_range);
});
});
}
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment