Commit e371df51 authored by Chao Liu

use buffer load OOB check for padding

parent 7a929377
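
This commit removes the caller-side if(coord.IsOffsetValidAssumingUpperIndexIsValid()) guards in ThreadwiseGenericTensorSliceCopy_v4r2 and instead threads src_valid/dst_valid flags down into transfer_data, so the AMD buffer-addressing path can rely on the hardware buffer load/store out-of-bounds check to supply the zero padding. A minimal plain-C++ sketch of the Set-transfer semantics the flags encode (illustrative names only, not the library's API):

// Hedged model only: an invalid destination drops the write entirely; an invalid
// source writes zero, which is how padded elements get filled.
template <typename T>
void model_transfer_set(const T* p_src, int src_offset, bool src_valid,
                        T* p_dst, int dst_offset, bool dst_valid)
{
    if(dst_valid)
    {
        p_dst[dst_offset] = src_valid ? p_src[src_offset] : T(0);
    }
}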
......@@ -112,17 +112,18 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// Check whether the src data's mapping is valid; only check the first element of this src
// vector. It is the user's responsibility to make sure all data in the src vector
// shares the same valid/invalid mapping
if(src_coord.IsOffsetValidAssumingUpperIndexIsValid())
{
transfer_data<SrcData,
SrcDataPerRead,
SrcAddressSpace,
AddressSpace::Vgpr,
InMemoryDataOperation::Set,
SrcDataStride,
1>(
p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
}
transfer_data<SrcData,
SrcDataPerRead,
SrcAddressSpace,
AddressSpace::Vgpr,
InMemoryDataOperation::Set,
SrcDataStride,
1>(p_src,
src_coord.GetOffset(),
src_coord.IsOffsetValidAssumingUpperIndexIsValid(),
p_src_long_vector,
buffer_offset,
true);
}
// SrcData to DstData conversion
......@@ -146,17 +147,18 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// Check whether the dst data's mapping is valid; only check the first element of this dst
// vector. It is the user's responsibility to make sure all data in the dst vector
// shares the same valid/invalid mapping
if(dst_coord.IsOffsetValidAssumingUpperIndexIsValid())
{
transfer_data<DstData,
DstDataPerWrite,
AddressSpace::Vgpr,
DstAddressSpace,
DstInMemOp,
1,
DstDataStride>(
p_dst_long_vector, buffer_offset, p_dst, dst_coord.GetOffset());
}
transfer_data<DstData,
DstDataPerWrite,
AddressSpace::Vgpr,
DstAddressSpace,
DstInMemOp,
1,
DstDataStride>(p_dst_long_vector,
buffer_offset,
true,
p_dst,
dst_coord.GetOffset(),
dst_coord.IsOffsetValidAssumingUpperIndexIsValid());
}
});
}
......@@ -266,18 +268,17 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// src
// vector. It is the user's responsibility to make sure all data in the src vector
// shares the same valid/invalid mapping
if(src_coord.IsOffsetValidAssumingUpperIndexIsValid())
{
transfer_data<SrcData,
SrcDataPerRead,
SrcAddressSpace,
AddressSpace::Vgpr,
InMemoryDataOperation::Set>(p_src,
src_nonlinear_coord.GetOffset() +
src_linear_offset,
p_src_long_vector,
buffer_offset);
}
transfer_data<SrcData,
SrcDataPerRead,
SrcAddressSpace,
AddressSpace::Vgpr,
InMemoryDataOperation::Set>(
p_src,
src_nonlinear_coord.GetOffset() + src_linear_offset,
src_coord.IsOffsetValidAssumingUpperIndexIsValid(),
p_src_long_vector,
buffer_offset,
true);
}
// SrcData to DstData conversion
......@@ -305,15 +306,16 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// dst
// vector. It is the user's responsibility to make sure all data in the dst vector
// shares the same valid/invalid mapping
if(dst_coord.IsOffsetValidAssumingUpperIndexIsValid())
{
transfer_data<DstData,
DstDataPerWrite,
AddressSpace::Vgpr,
DstAddressSpace,
DstInMemOp>(
p_dst_long_vector, buffer_offset, p_dst, dst_coord.GetOffset());
}
transfer_data<DstData,
DstDataPerWrite,
AddressSpace::Vgpr,
DstAddressSpace,
DstInMemOp>(p_dst_long_vector,
buffer_offset,
true,
p_dst,
dst_coord.GetOffset(),
dst_coord.IsOffsetValidAssumingUpperIndexIsValid());
}
});
});
......@@ -405,15 +407,17 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// src
// vector. It is the user's responsibility to make sure all data in the src vector
// shares the same valid/invalid mapping
if(src_coord.IsOffsetValidAssumingUpperIndexIsValid())
{
transfer_data<SrcData,
SrcDataPerRead,
SrcAddressSpace,
AddressSpace::Vgpr,
InMemoryDataOperation::Set>(
p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
}
transfer_data<SrcData,
SrcDataPerRead,
SrcAddressSpace,
AddressSpace::Vgpr,
InMemoryDataOperation::Set>(
p_src,
src_coord.GetOffset(),
src_coord.IsOffsetValidAssumingUpperIndexIsValid(),
p_src_long_vector,
buffer_offset,
true);
}
// SrcData to DstData conversion
......@@ -450,18 +454,16 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// dst
// vector. It is the user's responsibility to make sure all data in the dst vector
// shares the same valid/invalid mapping
if(dst_coord.IsOffsetValidAssumingUpperIndexIsValid())
{
transfer_data<DstData,
DstDataPerWrite,
AddressSpace::Vgpr,
DstAddressSpace,
DstInMemOp>(p_dst_long_vector,
buffer_offset,
p_dst,
dst_nonlinear_coord.GetOffset() +
dst_linear_offset);
}
transfer_data<DstData,
DstDataPerWrite,
AddressSpace::Vgpr,
DstAddressSpace,
DstInMemOp>(p_dst_long_vector,
buffer_offset,
true,
p_dst,
dst_nonlinear_coord.GetOffset() + dst_linear_offset,
dst_coord.IsOffsetValidAssumingUpperIndexIsValid());
}
});
});
......
......@@ -47,10 +47,25 @@ struct SetData
// This version is only for compatibility; don't use it if possible
template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
__device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
__device__ void Run(const T* p_src,
index_t src_offset,
bool src_valid,
T* p_dst,
index_t dst_offset,
bool dst_valid) const
{
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
if(dst_valid)
{
if(src_valid)
{
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
}
else
{
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = 0;
}
}
}
#if CK_USE_AMD_BUFFER_ADDRESSING
......@@ -61,11 +76,16 @@ struct SetData
template <>
__device__ void Run<AddressSpace::Global, AddressSpace::Vgpr>(const T* p_src,
index_t src_offset,
bool src_valid,
T* p_dst,
index_t dst_offset) const
index_t dst_offset,
bool dst_valid) const
{
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
amd_buffer_load<T, DataPerAccess>(p_src, src_offset, 0);
if(dst_valid)
{
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
amd_buffer_load<T, DataPerAccess>(p_src, src_offset, 0, src_valid);
}
}
// buffer_store requires:
......@@ -75,10 +95,15 @@ struct SetData
template <>
__device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
index_t src_offset,
bool src_valid,
T* p_dst,
index_t dst_offset) const
index_t dst_offset,
bool dst_valid) const
{
amd_buffer_store<T, DataPerAccess>(&(p_src[src_offset]), p_dst, dst_offset, 0);
const auto zeros = vector_t(0);
amd_buffer_store<T, DataPerAccess>(
src_valid ? &(p_src[src_offset]) : reinterpret_cast<const T*>(&zeros), p_dst, dst_offset, 0, dst_valid);
}
#endif
};
......@@ -90,10 +115,18 @@ struct AtomicAddData
// This version is only for compatibility; don't use it if possible
template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
__device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
__device__ void Run(const T* p_src,
index_t src_offset,
bool src_valid,
T* p_dst,
index_t dst_offset,
bool dst_valid) const
{
atomic_add_impl(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
*reinterpret_cast<const vector_t*>(&p_src[src_offset]));
if(src_valid && dst_valid)
{
atomic_add_impl(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
*reinterpret_cast<const vector_t*>(&p_src[src_offset]));
}
}
#if CK_USE_AMD_BUFFER_ADDRESSING && CK_USE_AMD_BUFFER_ATOMIC_ADD
......@@ -104,10 +137,14 @@ struct AtomicAddData
template <>
__device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
index_t src_offset,
T* p_dst,
index_t dst_offset) const
bool src_valid,
T* p_dst,
index_t dst_offset,
bool dst_valid) const
{
amd_buffer_atomic_add<T, DataPerAccess>(&(p_src[src_offset]), p_dst, dst_offset, 0);
const auto zeros = vector_t(0);
amd_buffer_atomic_add<T, DataPerAccess>(
src_valid ? &(p_src[src_offset]) : reinterpret_cast<const T*>(&zeros), p_dst, dst_offset, 0, dst_valid);
}
#endif
};
......@@ -119,7 +156,12 @@ template <typename T,
InMemoryDataOperation DstInMemOp,
index_t SrcDataStride = 1,
index_t DstDataStride = 1>
__device__ void transfer_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
__device__ void transfer_data(const T* p_src,
index_t src_offset,
bool src_valid,
T* p_dst,
index_t dst_offset,
bool dst_valid)
{
static_assert(DstInMemOp == InMemoryDataOperation::Set ||
DstInMemOp == InMemoryDataOperation::AtomicAdd,
......@@ -131,27 +173,37 @@ __device__ void transfer_data(const T* p_src, index_t src_offset, T* p_dst, inde
// TODO: use static_if::ElseIf
static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
SetData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
p_src, src_offset, src_valid, p_dst, dst_offset, dst_valid);
});
static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
AtomicAddData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
p_src, src_offset, src_valid, p_dst, dst_offset, dst_valid);
});
}
else
{
for(index_t i = 0; i < DataPerAccess; i++)
for(index_t i = 0; i < DataPerAccess; ++i)
{
// TODO: use static_if::ElseIf
static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
SetData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset + i * SrcDataStride, p_dst, dst_offset + i * DstDataStride);
p_src,
src_offset + i * SrcDataStride,
src_valid,
p_dst,
dst_offset + i * DstDataStride,
dst_valid);
});
static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
AtomicAddData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset + i * SrcDataStride, p_dst, dst_offset + i * DstDataStride);
p_src,
src_offset + i * SrcDataStride,
src_valid,
p_dst,
dst_offset + i * DstDataStride,
dst_valid);
});
}
}
......
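
The AtomicAddData path above keeps a stricter rule on the generic (non-buffer) fallback: the update is applied only when both the source and the destination are valid, while the buffer path substitutes a zero vector for an invalid source and lets the buffer atomic's range check drop an invalid destination. A hedged model of the generic-path behavior (illustrative names only, not the library's API):

// Hedged model only: the atomic update is applied only when both sides map to
// valid tensor elements. Plain += stands in for the device atomic here.
template <typename T>
void model_transfer_atomic_add(const T* p_src, int src_offset, bool src_valid,
                               T* p_dst, int dst_offset, bool dst_valid)
{
    if(src_valid && dst_valid)
    {
        p_dst[dst_offset] += p_src[src_offset];
    }
}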