gaoqiong / composable_kernel · Commits

Commit e371df51, authored Jun 27, 2020 by Chao Liu

use buffer load OOB check for padding
parent 7a929377
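In other words: instead of wrapping each per-element copy in an `if (offset is valid)` branch, the commit threads the validity flag down into the memory operation itself, where the AMD buffer instructions' out-of-bounds check can absorb it (an out-of-range buffer_load returns 0; an out-of-range buffer_store is dropped). A minimal host-side model of the intended semantics, with illustrative names only, not code from this commit:

    template <typename T>
    T guarded_load(const T* p, int offset, bool valid)
    {
        // Models buffer_load's OOB behavior: invalid (padding) reads yield 0.
        return valid ? p[offset] : T(0);
    }

    template <typename T>
    void guarded_store(T* p, int offset, T value, bool valid)
    {
        // Models buffer_store's OOB behavior: invalid writes are dropped.
        if(valid)
            p[offset] = value;
    }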
Showing 3 changed files with 346 additions and 157 deletions:

    composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp (+66, -64)
    composable_kernel/include/utility/amd_buffer_addressing.hpp (+208, -73)
    composable_kernel/include/utility/in_memory_operation.amd.hpp.in (+72, -20)
File: composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -112,17 +112,18 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
             // Check src data's valid mapping situation, only check the first data in this src
             // vector. It's user's responsiblity to make sure all data in the src vector
             // has the valid/invalid mapping situation
-            if(src_coord.IsOffsetValidAssumingUpperIndexIsValid())
-            {
-                transfer_data<SrcData,
-                              SrcDataPerRead,
-                              SrcAddressSpace,
-                              AddressSpace::Vgpr,
-                              InMemoryDataOperation::Set,
-                              SrcDataStride,
-                              1>(p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
-            }
+            transfer_data<SrcData,
+                          SrcDataPerRead,
+                          SrcAddressSpace,
+                          AddressSpace::Vgpr,
+                          InMemoryDataOperation::Set,
+                          SrcDataStride,
+                          1>(p_src,
+                             src_coord.GetOffset(),
+                             src_coord.IsOffsetValidAssumingUpperIndexIsValid(),
+                             p_src_long_vector,
+                             buffer_offset,
+                             true);
         }

         // SrcData to DstData conversion
@@ -146,17 +147,18 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
             // Check dst data's valid mapping situation, only check the first data in this dst
             // vector. It's user's responsiblity to make sure all data in the dst vector
             // has the valid/invalid mapping situation
-            if(dst_coord.IsOffsetValidAssumingUpperIndexIsValid())
-            {
-                transfer_data<DstData,
-                              DstDataPerWrite,
-                              AddressSpace::Vgpr,
-                              DstAddressSpace,
-                              DstInMemOp,
-                              1,
-                              DstDataStride>(
-                    p_dst_long_vector, buffer_offset, p_dst, dst_coord.GetOffset());
-            }
+            transfer_data<DstData,
+                          DstDataPerWrite,
+                          AddressSpace::Vgpr,
+                          DstAddressSpace,
+                          DstInMemOp,
+                          1,
+                          DstDataStride>(p_dst_long_vector,
+                                         buffer_offset,
+                                         true,
+                                         p_dst,
+                                         dst_coord.GetOffset(),
+                                         dst_coord.IsOffsetValidAssumingUpperIndexIsValid());
         }
     });
 }
@@ -266,18 +268,17 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 // src
                 // vector. It's user's responsiblity to make sure all data in the src vector
                 // has the valid/invalid mapping situation
-                if(src_coord.IsOffsetValidAssumingUpperIndexIsValid())
-                {
-                    transfer_data<SrcData,
-                                  SrcDataPerRead,
-                                  SrcAddressSpace,
-                                  AddressSpace::Vgpr,
-                                  InMemoryDataOperation::Set>(
-                        p_src,
-                        src_nonlinear_coord.GetOffset() + src_linear_offset,
-                        p_src_long_vector,
-                        buffer_offset);
-                }
+                transfer_data<SrcData,
+                              SrcDataPerRead,
+                              SrcAddressSpace,
+                              AddressSpace::Vgpr,
+                              InMemoryDataOperation::Set>(
+                    p_src,
+                    src_nonlinear_coord.GetOffset() + src_linear_offset,
+                    src_coord.IsOffsetValidAssumingUpperIndexIsValid(),
+                    p_src_long_vector,
+                    buffer_offset,
+                    true);
             }

             // SrcData to DstData conversion
@@ -305,15 +306,16 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 // dst
                 // vector. It's user's responsiblity to make sure all data in the dst vector
                 // has the valid/invalid mapping situation
-                if(dst_coord.IsOffsetValidAssumingUpperIndexIsValid())
-                {
-                    transfer_data<DstData,
-                                  DstDataPerWrite,
-                                  AddressSpace::Vgpr,
-                                  DstAddressSpace,
-                                  DstInMemOp>(
-                        p_dst_long_vector, buffer_offset, p_dst, dst_coord.GetOffset());
-                }
+                transfer_data<DstData,
+                              DstDataPerWrite,
+                              AddressSpace::Vgpr,
+                              DstAddressSpace,
+                              DstInMemOp>(p_dst_long_vector,
+                                          buffer_offset,
+                                          true,
+                                          p_dst,
+                                          dst_coord.GetOffset(),
+                                          dst_coord.IsOffsetValidAssumingUpperIndexIsValid());
             }
         });
     });
@@ -405,15 +407,17 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 // src
                 // vector. It's user's responsiblity to make sure all data in the src vector
                 // has the valid/invalid mapping situation
-                if(src_coord.IsOffsetValidAssumingUpperIndexIsValid())
-                {
-                    transfer_data<SrcData,
-                                  SrcDataPerRead,
-                                  SrcAddressSpace,
-                                  AddressSpace::Vgpr,
-                                  InMemoryDataOperation::Set>(
-                        p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
-                }
+                transfer_data<SrcData,
+                              SrcDataPerRead,
+                              SrcAddressSpace,
+                              AddressSpace::Vgpr,
+                              InMemoryDataOperation::Set>(p_src,
+                                                          src_coord.GetOffset(),
+                                                          src_coord.IsOffsetValidAssumingUpperIndexIsValid(),
+                                                          p_src_long_vector,
+                                                          buffer_offset,
+                                                          true);
             }

             // SrcData to DstData conversion
@@ -450,18 +454,16 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 // dst
                 // vector. It's user's responsiblity to make sure all data in the dst vector
                 // has the valid/invalid mapping situation
-                if(dst_coord.IsOffsetValidAssumingUpperIndexIsValid())
-                {
-                    transfer_data<DstData,
-                                  DstDataPerWrite,
-                                  AddressSpace::Vgpr,
-                                  DstAddressSpace,
-                                  DstInMemOp>(p_dst_long_vector,
-                                              buffer_offset,
-                                              p_dst,
-                                              dst_nonlinear_coord.GetOffset() + dst_linear_offset);
-                }
+                transfer_data<DstData,
+                              DstDataPerWrite,
+                              AddressSpace::Vgpr,
+                              DstAddressSpace,
+                              DstInMemOp>(p_dst_long_vector,
+                                          buffer_offset,
+                                          true,
+                                          p_dst,
+                                          dst_nonlinear_coord.GetOffset() + dst_linear_offset,
+                                          dst_coord.IsOffsetValidAssumingUpperIndexIsValid());
             }
         });
     });
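Across all six call sites above the change is the same: validity is computed once from the tensor coordinate and passed through as a runtime argument, while the register-side end of the transfer is unconditionally valid (the hard-coded `true`). A self-contained scalar analogue of the new call contract, using hypothetical names rather than CK code:

    #include <cstdio>

    template <typename T>
    void transfer_like(const T* p_src, int src_off, bool src_valid,
                       T* p_dst, int dst_off, bool dst_valid)
    {
        // dst_valid gates the write (like buffer_store OOB); src_valid selects
        // real data vs. the zero an OOB buffer_load would return.
        if(dst_valid)
            p_dst[dst_off] = src_valid ? p_src[src_off] : T(0);
    }

    int main()
    {
        float src[4] = {1, 2, 3, 4};
        float dst[4] = {9, 9, 9, 9};

        transfer_like(src, 1, true, dst, 0, true);  // valid copy:   dst[0] = 2
        transfer_like(src, 3, false, dst, 1, true); // padding read: dst[1] = 0
        transfer_like(src, 0, true, dst, 2, false); // invalid dst:  dst[2] stays 9

        std::printf("%g %g %g\n", dst[0], dst[1], dst[2]); // prints: 2 0 9
        return 0;
    }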
File: composable_kernel/include/utility/amd_buffer_addressing.hpp

(diff collapsed in this view; +208, -73)
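The expanded contents of this file are not shown on the page, so the following is only a plausible model of the mechanism the commit title refers to, assuming raw buffer addressing with the descriptor's num_records set to the allocation's element count; the real file uses GCN buffer intrinsics:

    #include <cstdint>

    // Hypothetical host-side model, not the file's contents.
    struct BufferResourceModel
    {
        const float* base;
        std::uint32_t num_records; // the hardware bounds-checks offsets against this
    };

    // A raw buffer_load whose offset is out of range returns 0 instead of
    // faulting; steering invalid (padding) accesses out of range exploits that.
    float model_buffer_load(const BufferResourceModel& rsrc, std::uint32_t offset)
    {
        return offset < rsrc.num_records ? rsrc.base[offset] : 0.0f;
    }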
File: composable_kernel/include/utility/in_memory_operation.amd.hpp.in
@@ -47,10 +47,25 @@ struct SetData
     // This version is only for compatibility, don't use this version if possible
     template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
-    __device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
+    __device__ void Run(const T* p_src,
+                        index_t src_offset,
+                        bool src_valid,
+                        T* p_dst,
+                        index_t dst_offset,
+                        bool dst_valid) const
     {
-        *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
-            *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
+        if(dst_valid)
+        {
+            if(src_valid)
+            {
+                *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
+                    *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
+            }
+            else
+            {
+                *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = 0;
+            }
+        }
     }

 #if CK_USE_AMD_BUFFER_ADDRESSING
@@ -61,11 +76,16 @@ struct SetData
     template <>
     __device__ void Run<AddressSpace::Global, AddressSpace::Vgpr>(const T* p_src,
                                                                   index_t src_offset,
+                                                                  bool src_valid,
                                                                   T* p_dst,
-                                                                  index_t dst_offset) const
+                                                                  index_t dst_offset,
+                                                                  bool dst_valid) const
     {
-        *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
-            amd_buffer_load<T, DataPerAccess>(p_src, src_offset, 0);
+        if(dst_valid)
+        {
+            *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
+                amd_buffer_load<T, DataPerAccess>(p_src, src_offset, 0, src_valid);
+        }
     }

     // buffer_store requires:
@@ -75,10 +95,15 @@ struct SetData
     template <>
     __device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
                                                                   index_t src_offset,
+                                                                  bool src_valid,
                                                                   T* p_dst,
-                                                                  index_t dst_offset) const
+                                                                  index_t dst_offset,
+                                                                  bool dst_valid) const
     {
-        amd_buffer_store<T, DataPerAccess>(&(p_src[src_offset]), p_dst, dst_offset, 0);
+        const auto zeros = vector_t(0);
+
+        amd_buffer_store<T, DataPerAccess>(
+            src_valid ? &(p_src[src_offset]) : &zeros, p_dst, dst_offset, 0, dst_valid);
     }
 #endif
 };
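The `zeros` local above exists because the store path always needs a source pointer: rather than branching around the store when the source element is invalid, the call points at a zero vector, and `dst_valid` (forwarded into `amd_buffer_store`) decides whether the write lands at all. A scalar sketch of that pattern, with hypothetical names:

    #include <cassert>

    void store_guarded(const float* p_src, float* p_dst, bool src_valid, bool dst_valid)
    {
        const float zeros = 0.0f;
        const float* effective_src = src_valid ? p_src : &zeros; // no branch around the store
        if(dst_valid) // models the buffer_store OOB drop
            *p_dst = *effective_src;
    }

    int main()
    {
        float v = 3.5f, out = -1.0f;
        store_guarded(&v, &out, /*src_valid=*/false, /*dst_valid=*/true);
        assert(out == 0.0f); // an invalid (padding) source stores zeros
        return 0;
    }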
@@ -90,10 +115,18 @@ struct AtomicAddData
     // This version is only for compatibility, don't use this version if possible
     template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
-    __device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
+    __device__ void Run(const T* p_src,
+                        index_t src_offset,
+                        bool src_valid,
+                        T* p_dst,
+                        index_t dst_offset,
+                        bool dst_valid) const
     {
-        atomic_add_impl(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
-                        *reinterpret_cast<const vector_t*>(&p_src[src_offset]));
+        if(src_valid && dst_valid)
+        {
+            atomic_add_impl(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
+                            *reinterpret_cast<const vector_t*>(&p_src[src_offset]));
+        }
     }

 #if CK_USE_AMD_BUFFER_ADDRESSING && CK_USE_AMD_BUFFER_ATOMIC_ADD
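Note the asymmetry with SetData: for Set an invalid source still writes (zeros), but for AtomicAdd the whole update is skipped when either side is invalid. The two are equivalent here, because accumulating the zero an OOB read would return changes nothing. A one-line illustration:

    #include <cassert>

    int main()
    {
        float acc = 5.0f;
        const bool src_valid = false;
        if(src_valid) { acc += 1.25f; } // skipped: same result as acc += 0.0f
        assert(acc == 5.0f);
        return 0;
    }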
@@ -104,10 +137,14 @@ struct AtomicAddData
     template <>
     __device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
                                                                   index_t src_offset,
+                                                                  bool src_valid,
                                                                   T* p_dst,
-                                                                  index_t dst_offset) const
+                                                                  index_t dst_offset,
+                                                                  bool dst_valid) const
     {
-        amd_buffer_atomic_add<T, DataPerAccess>(&(p_src[src_offset]), p_dst, dst_offset, 0);
+        const auto zeros = vector_t(0);
+
+        amd_buffer_atomic_add<T, DataPerAccess>(
+            src_valid ? &(p_src[src_offset]) : &zeros, p_dst, dst_offset, 0, dst_valid);
     }
 #endif
 };
@@ -119,7 +156,12 @@ template <typename T,
           InMemoryDataOperation DstInMemOp,
           index_t SrcDataStride = 1,
           index_t DstDataStride = 1>
-__device__ void transfer_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
+__device__ void transfer_data(const T* p_src,
+                              index_t src_offset,
+                              bool src_valid,
+                              T* p_dst,
+                              index_t dst_offset,
+                              bool dst_valid)
 {
     static_assert(DstInMemOp == InMemoryDataOperation::Set ||
                       DstInMemOp == InMemoryDataOperation::AtomicAdd,
@@ -131,27 +173,37 @@ __device__ void transfer_data(const T* p_src, index_t src_offset, T* p_dst, inde
         // TODO: use static_if::ElseIf
         static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
             SetData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
-                p_src, src_offset, p_dst, dst_offset);
+                p_src, src_offset, src_valid, p_dst, dst_offset, dst_valid);
         });

         static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
             AtomicAddData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
-                p_src, src_offset, p_dst, dst_offset);
+                p_src, src_offset, src_valid, p_dst, dst_offset, dst_valid);
         });
     }
     else
     {
-        for(index_t i = 0; i < DataPerAccess; i++)
+        for(index_t i = 0; i < DataPerAccess; ++i)
         {
             // TODO: use static_if::ElseIf
             static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
                 SetData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
-                    p_src, src_offset + i * SrcDataStride, p_dst, dst_offset + i * DstDataStride);
+                    p_src,
+                    src_offset + i * SrcDataStride,
+                    src_valid,
+                    p_dst,
+                    dst_offset + i * DstDataStride,
+                    dst_valid);
             });

             static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
                 AtomicAddData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
-                    p_src, src_offset + i * SrcDataStride, p_dst, dst_offset + i * DstDataStride);
+                    p_src,
+                    src_offset + i * SrcDataStride,
+                    src_valid,
+                    p_dst,
+                    dst_offset + i * DstDataStride,
+                    dst_valid);
             });
         }
     }
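A host-side model of this dispatch, under the same assumptions as the sketches above (hypothetical names, a scalar loop standing in for both the vectorized and per-element branches): the same validity flags apply to every element, whether the transfer is one vectorized access or a strided loop.

    #include <cstdio>

    // Scalar stand-in for both branches of the dispatch above; not CK code.
    template <int DataPerAccess>
    void transfer_model(const float* p_src, int src_off, bool src_valid,
                        float* p_dst, int dst_off, bool dst_valid,
                        int src_stride = 1, int dst_stride = 1)
    {
        for(int i = 0; i < DataPerAccess; ++i)
        {
            if(dst_valid)
                p_dst[dst_off + i * dst_stride] =
                    src_valid ? p_src[src_off + i * src_stride] : 0.0f;
        }
    }

    int main()
    {
        float src[4] = {1, 2, 3, 4};
        float dst[8] = {};

        transfer_model<4>(src, 0, true, dst, 0, true, /*src_stride=*/1, /*dst_stride=*/2);

        std::printf("%g %g %g %g\n", dst[0], dst[2], dst[4], dst[6]); // prints: 1 2 3 4
        return 0;
    }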