"git@developer.sourcefind.cn:modelzoo/resnet50_tensorflow.git" did not exist on "ee222211e1d9c57b2b59037e4916e297d93a60b0"
Commit e371df51 authored by Chao Liu

use buffer load OOB check for padding

parent 7a929377
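
This change removes the `if(coord.IsOffsetValidAssumingUpperIndexIsValid())` guard that wrapped every vectorized copy in `ThreadwiseGenericTensorSliceCopy_v4r2`. Validity is instead passed into `transfer_data` as explicit `src_valid`/`dst_valid` arguments and threaded down to `SetData`/`AtomicAddData`, so the out-of-bounds check for padding regions can be delegated to AMD buffer addressing, where an out-of-range buffer_load returns 0 and an out-of-range buffer_store is dropped. Below is a minimal host-side C++ sketch of the resulting contract; `transfer_data_sketch` is a hypothetical stand-in for the device function, using plain arrays instead of GPU address spaces.

#include <cassert>
#include <cstddef>

// Sketch of the new transfer_data contract: validity travels with the call
// instead of gating it. An invalid source reads as zero (matching the
// zero-fill of an out-of-bounds AMD buffer_load); an invalid destination
// suppresses the write (matching an out-of-bounds buffer_store being dropped).
template <typename T>
void transfer_data_sketch(const T* p_src,
                          std::size_t src_offset,
                          bool src_valid,
                          T* p_dst,
                          std::size_t dst_offset,
                          bool dst_valid)
{
    if(dst_valid)
    {
        p_dst[dst_offset] = src_valid ? p_src[src_offset] : T(0);
    }
}

int main()
{
    float src[2] = {1.0f, 2.0f};
    float dst[2] = {-1.0f, -1.0f};
    transfer_data_sketch(src, std::size_t(1), true, dst, std::size_t(0), true);  // in-bounds copy
    transfer_data_sketch(src, std::size_t(0), false, dst, std::size_t(1), true); // padding lane: zero-filled
    assert(dst[0] == 2.0f && dst[1] == 0.0f);
}
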
@@ -112,17 +112,18 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
             // Check src data's valid mapping situation, only check the first data in this src
             // vector. It's user's responsibility to make sure all data in the src vector
             // has the valid/invalid mapping situation
-            if(src_coord.IsOffsetValidAssumingUpperIndexIsValid())
-            {
-                transfer_data<SrcData,
-                              SrcDataPerRead,
-                              SrcAddressSpace,
-                              AddressSpace::Vgpr,
-                              InMemoryDataOperation::Set,
-                              SrcDataStride,
-                              1>(
-                    p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
-            }
+            transfer_data<SrcData,
+                          SrcDataPerRead,
+                          SrcAddressSpace,
+                          AddressSpace::Vgpr,
+                          InMemoryDataOperation::Set,
+                          SrcDataStride,
+                          1>(p_src,
+                             src_coord.GetOffset(),
+                             src_coord.IsOffsetValidAssumingUpperIndexIsValid(),
+                             p_src_long_vector,
+                             buffer_offset,
+                             true);
         }

         // SrcData to DstData conversion
@@ -146,17 +147,18 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
             // Check dst data's valid mapping situation, only check the first data in this dst
             // vector. It's user's responsibility to make sure all data in the dst vector
             // has the valid/invalid mapping situation
-            if(dst_coord.IsOffsetValidAssumingUpperIndexIsValid())
-            {
-                transfer_data<DstData,
-                              DstDataPerWrite,
-                              AddressSpace::Vgpr,
-                              DstAddressSpace,
-                              DstInMemOp,
-                              1,
-                              DstDataStride>(
-                    p_dst_long_vector, buffer_offset, p_dst, dst_coord.GetOffset());
-            }
+            transfer_data<DstData,
+                          DstDataPerWrite,
+                          AddressSpace::Vgpr,
+                          DstAddressSpace,
+                          DstInMemOp,
+                          1,
+                          DstDataStride>(p_dst_long_vector,
+                                         buffer_offset,
+                                         true,
+                                         p_dst,
+                                         dst_coord.GetOffset(),
+                                         dst_coord.IsOffsetValidAssumingUpperIndexIsValid());
         }
     });
 }
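
The two hunks above fix the argument pattern for the whole file: reads into registers pass the coordinate check as `src_valid` with `dst_valid` hard-coded `true`, and writes from registers do the opposite. A compile-only sketch of that asymmetry, reusing the hypothetical `transfer_data_sketch` from the overview block above:

#include <cstddef>

// Declared in the overview sketch; repeated here so this block compiles alone.
template <typename T>
void transfer_data_sketch(const T*, std::size_t, bool, T*, std::size_t, bool);

// One long-vector move, read side then write side, mirroring the two hunks.
void copy_one_vector(const float* p_global_src, float* p_global_dst, float* p_vgpr,
                     std::size_t src_off, std::size_t dst_off, std::size_t buf_off,
                     bool src_coord_valid, bool dst_coord_valid)
{
    // Global -> VGPR: only the source can be out of bounds, so the
    // destination flag is hard-coded true (registers are always valid).
    transfer_data_sketch(p_global_src, src_off, src_coord_valid, p_vgpr, buf_off, true);

    // VGPR -> global: the register source is always valid, so the source
    // flag is hard-coded true and only the destination is predicated.
    transfer_data_sketch(p_vgpr, buf_off, true, p_global_dst, dst_off, dst_coord_valid);
}
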
@@ -266,18 +268,17 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
             // src
             // vector. It's user's responsibility to make sure all data in the src vector
             // has the valid/invalid mapping situation
-            if(src_coord.IsOffsetValidAssumingUpperIndexIsValid())
-            {
-                transfer_data<SrcData,
-                              SrcDataPerRead,
-                              SrcAddressSpace,
-                              AddressSpace::Vgpr,
-                              InMemoryDataOperation::Set>(p_src,
-                                                          src_nonlinear_coord.GetOffset() +
-                                                              src_linear_offset,
-                                                          p_src_long_vector,
-                                                          buffer_offset);
-            }
+            transfer_data<SrcData,
+                          SrcDataPerRead,
+                          SrcAddressSpace,
+                          AddressSpace::Vgpr,
+                          InMemoryDataOperation::Set>(
+                p_src,
+                src_nonlinear_coord.GetOffset() + src_linear_offset,
+                src_coord.IsOffsetValidAssumingUpperIndexIsValid(),
+                p_src_long_vector,
+                buffer_offset,
+                true);
         }

         // SrcData to DstData conversion
@@ -305,15 +306,16 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
             // dst
             // vector. It's user's responsibility to make sure all data in the dst vector
             // has the valid/invalid mapping situation
-            if(dst_coord.IsOffsetValidAssumingUpperIndexIsValid())
-            {
-                transfer_data<DstData,
-                              DstDataPerWrite,
-                              AddressSpace::Vgpr,
-                              DstAddressSpace,
-                              DstInMemOp>(
-                    p_dst_long_vector, buffer_offset, p_dst, dst_coord.GetOffset());
-            }
+            transfer_data<DstData,
+                          DstDataPerWrite,
+                          AddressSpace::Vgpr,
+                          DstAddressSpace,
+                          DstInMemOp>(p_dst_long_vector,
+                                      buffer_offset,
+                                      true,
+                                      p_dst,
+                                      dst_coord.GetOffset(),
+                                      dst_coord.IsOffsetValidAssumingUpperIndexIsValid());
         }
     });
 });
@@ -405,15 +407,17 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
             // src
             // vector. It's user's responsibility to make sure all data in the src vector
             // has the valid/invalid mapping situation
-            if(src_coord.IsOffsetValidAssumingUpperIndexIsValid())
-            {
-                transfer_data<SrcData,
-                              SrcDataPerRead,
-                              SrcAddressSpace,
-                              AddressSpace::Vgpr,
-                              InMemoryDataOperation::Set>(
-                    p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
-            }
+            transfer_data<SrcData,
+                          SrcDataPerRead,
+                          SrcAddressSpace,
+                          AddressSpace::Vgpr,
+                          InMemoryDataOperation::Set>(
+                p_src,
+                src_coord.GetOffset(),
+                src_coord.IsOffsetValidAssumingUpperIndexIsValid(),
+                p_src_long_vector,
+                buffer_offset,
+                true);
         }

         // SrcData to DstData conversion
@@ -450,18 +454,16 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
             // dst
             // vector. It's user's responsibility to make sure all data in the dst vector
             // has the valid/invalid mapping situation
-            if(dst_coord.IsOffsetValidAssumingUpperIndexIsValid())
-            {
-                transfer_data<DstData,
-                              DstDataPerWrite,
-                              AddressSpace::Vgpr,
-                              DstAddressSpace,
-                              DstInMemOp>(p_dst_long_vector,
-                                          buffer_offset,
-                                          p_dst,
-                                          dst_nonlinear_coord.GetOffset() +
-                                              dst_linear_offset);
-            }
+            transfer_data<DstData,
+                          DstDataPerWrite,
+                          AddressSpace::Vgpr,
+                          DstAddressSpace,
+                          DstInMemOp>(p_dst_long_vector,
+                                      buffer_offset,
+                                      true,
+                                      p_dst,
+                                      dst_nonlinear_coord.GetOffset() + dst_linear_offset,
+                                      dst_coord.IsOffsetValidAssumingUpperIndexIsValid());
         }
     });
 });
...
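
Note that every hunk keeps the comment stating that only the first element of each long vector is checked; it remains the user's responsibility that a vector is uniformly valid or uniformly invalid. A small constexpr sketch of when that precondition holds for right-padding (hypothetical helper, not part of the diff):

// The slice copy checks validity once per vector, so a vector must not
// straddle the valid/padding boundary. With valid_len valid elements
// followed by padding, that holds when valid_len % vector_size == 0.
constexpr bool vector_validity_is_uniform(int start, int vector_size, int valid_len)
{
    const bool all_valid   = (start + vector_size) <= valid_len; // fully in valid region
    const bool all_invalid = start >= valid_len;                 // fully in padding
    return all_valid || all_invalid;
}

static_assert(vector_validity_is_uniform(0, 4, 8));  // valid vector
static_assert(vector_validity_is_uniform(8, 4, 8));  // padding vector
static_assert(!vector_validity_is_uniform(6, 4, 8)); // straddles the boundary
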
@@ -47,10 +47,25 @@ struct SetData
     // This version is only for compatibility, don't use this version if possible
     template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
-    __device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
+    __device__ void Run(const T* p_src,
+                        index_t src_offset,
+                        bool src_valid,
+                        T* p_dst,
+                        index_t dst_offset,
+                        bool dst_valid) const
     {
-        *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
-            *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
+        if(dst_valid)
+        {
+            if(src_valid)
+            {
+                *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
+                    *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
+            }
+            else
+            {
+                *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = 0;
+            }
+        }
     }
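
The generic (compatibility) `SetData::Run` now encodes the padding semantics directly: a valid destination paired with an invalid source is zero-filled rather than skipped. A plain-C++ sketch with a tiny self-check (names are stand-ins for the device code):

#include <cassert>

template <typename T>
struct SetDataSketch
{
    // Scalar stand-in for SetData::Run after this commit.
    void Run(const T* p_src, int src_offset, bool src_valid,
             T* p_dst, int dst_offset, bool dst_valid) const
    {
        if(dst_valid)
        {
            // src_valid == false never dereferences p_src (short-circuit).
            p_dst[dst_offset] = src_valid ? p_src[src_offset] : T(0);
        }
    }
};

int main()
{
    float src[2] = {1.f, 2.f};
    float dst[3] = {-1.f, -1.f, -1.f};
    SetDataSketch<float>{}.Run(src, 0, true, dst, 0, true);  // normal copy
    SetDataSketch<float>{}.Run(src, 9, false, dst, 1, true); // padding -> zero-fill
    SetDataSketch<float>{}.Run(src, 1, true, dst, 2, false); // masked write
    assert(dst[0] == 1.f && dst[1] == 0.f && dst[2] == -1.f);
}
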
@@ -61,11 +76,16 @@ struct SetData
 #if CK_USE_AMD_BUFFER_ADDRESSING

     template <>
     __device__ void Run<AddressSpace::Global, AddressSpace::Vgpr>(const T* p_src,
                                                                   index_t src_offset,
+                                                                  bool src_valid,
                                                                   T* p_dst,
-                                                                  index_t dst_offset) const
+                                                                  index_t dst_offset,
+                                                                  bool dst_valid) const
     {
-        *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
-            amd_buffer_load<T, DataPerAccess>(p_src, src_offset, 0);
+        if(dst_valid)
+        {
+            *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
+                amd_buffer_load<T, DataPerAccess>(p_src, src_offset, 0, src_valid);
+        }
     }
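
Here the branch disappears entirely on the read path: `src_valid` is handed to `amd_buffer_load`, and per the commit title the hardware out-of-bounds check does the padding work, since a buffer load whose offset fails the descriptor's range check returns 0 without faulting. How the flag is mapped onto the range check is internal to `amd_buffer_load`; the scalar stand-in below only models the observable behavior, and its signature is an assumption:

// Plain-C++ stand-in for the hardware behavior amd_buffer_load relies on.
// The [0, valid_span) range stands in for the buffer descriptor's extent.
template <typename T>
T amd_buffer_load_sketch(const T* p_src, long src_offset, long valid_span, bool src_valid)
{
    // an invalid lane is treated exactly like an out-of-range offset
    const bool in_range = src_valid && src_offset >= 0 && src_offset < valid_span;
    return in_range ? p_src[src_offset] : T(0); // OOB => zero-fill, no fault
}
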
@@ -75,10 +95,15 @@ struct SetData
     // buffer_store requires:

     template <>
     __device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
                                                                   index_t src_offset,
+                                                                  bool src_valid,
                                                                   T* p_dst,
-                                                                  index_t dst_offset) const
+                                                                  index_t dst_offset,
+                                                                  bool dst_valid) const
     {
-        amd_buffer_store<T, DataPerAccess>(&(p_src[src_offset]), p_dst, dst_offset, 0);
+        const auto zeros = vector_t(0);
+
+        amd_buffer_store<T, DataPerAccess>(
+            src_valid ? &(p_src[src_offset]) : &zeros, p_dst, dst_offset, 0, dst_valid);
     }
 #endif
 };
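
The store path keeps a readable source in all cases by substituting a zero vector when `src_valid` is false, and hands `dst_valid` to `amd_buffer_store` so an invalid destination becomes a dropped out-of-range store. A plain-C++ stand-in for those semantics (assumed signature):

// Scalar stand-in for the predicated store: an invalid source stores zeros
// (harmless for Set into padding); an invalid destination performs no
// memory operation, as an out-of-range buffer_store would be dropped.
template <typename T>
void amd_buffer_store_sketch(const T* p_src, T* p_dst, long dst_offset,
                             bool src_valid, bool dst_valid)
{
    const T zeros = T(0);
    const T* src = src_valid ? p_src : &zeros; // always a readable source
    if(dst_valid)                              // hardware: OOB store is dropped
    {
        p_dst[dst_offset] = *src;
    }
}
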
@@ -90,10 +115,18 @@ struct AtomicAddData
     // This version is only for compatibility, don't use this version if possible
     template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
-    __device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
+    __device__ void Run(const T* p_src,
+                        index_t src_offset,
+                        bool src_valid,
+                        T* p_dst,
+                        index_t dst_offset,
+                        bool dst_valid) const
     {
-        atomic_add_impl(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
-                        *reinterpret_cast<const vector_t*>(&p_src[src_offset]));
+        if(src_valid && dst_valid)
+        {
+            atomic_add_impl(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
+                            *reinterpret_cast<const vector_t*>(&p_src[src_offset]));
+        }
     }

 #if CK_USE_AMD_BUFFER_ADDRESSING && CK_USE_AMD_BUFFER_ATOMIC_ADD
@@ -104,10 +137,14 @@ struct AtomicAddData
     template <>
     __device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
                                                                   index_t src_offset,
+                                                                  bool src_valid,
                                                                   T* p_dst,
-                                                                  index_t dst_offset) const
+                                                                  index_t dst_offset,
+                                                                  bool dst_valid) const
     {
-        amd_buffer_atomic_add<T, DataPerAccess>(&(p_src[src_offset]), p_dst, dst_offset, 0);
+        const auto zeros = vector_t(0);
+        amd_buffer_atomic_add<T, DataPerAccess>(
+            src_valid ? &(p_src[src_offset]) : &zeros, p_dst, dst_offset, 0, dst_valid);
     }
 #endif
 };
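
For atomic adds the two paths differ slightly: the compatibility version must skip the operation when either flag is false, while the buffer version can always issue it, because an invalid source contributes zero and x + 0 == x, and an invalid destination is suppressed by the same out-of-range mechanism as stores. A scalar sketch using `std::atomic` in place of the device atomic:

#include <atomic>
#include <cassert>

// Hypothetical scalar stand-in for the predicated atomic add.
template <typename T>
void atomic_add_sketch(std::atomic<T>& dst, T value, bool src_valid, bool dst_valid)
{
    const T contribution = src_valid ? value : T(0); // invalid source adds 0
    if(dst_valid) // invalid destination: the OOB atomic is dropped
    {
        dst.fetch_add(contribution);
    }
}

int main()
{
    std::atomic<int> acc{5};
    atomic_add_sketch(acc, 3, true, true);  // 5 + 3
    atomic_add_sketch(acc, 7, false, true); // adds 0
    atomic_add_sketch(acc, 9, true, false); // dropped
    assert(acc.load() == 8);
}
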
@@ -119,7 +156,12 @@ template <typename T,
           InMemoryDataOperation DstInMemOp,
           index_t SrcDataStride = 1,
           index_t DstDataStride = 1>
-__device__ void transfer_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
+__device__ void transfer_data(const T* p_src,
+                              index_t src_offset,
+                              bool src_valid,
+                              T* p_dst,
+                              index_t dst_offset,
+                              bool dst_valid)
 {
     static_assert(DstInMemOp == InMemoryDataOperation::Set ||
                       DstInMemOp == InMemoryDataOperation::AtomicAdd,
@@ -131,27 +173,37 @@ __device__ void transfer_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
         // TODO: use static_if::ElseIf
         static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
             SetData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
-                p_src, src_offset, p_dst, dst_offset);
+                p_src, src_offset, src_valid, p_dst, dst_offset, dst_valid);
         });

         static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
             AtomicAddData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
-                p_src, src_offset, p_dst, dst_offset);
+                p_src, src_offset, src_valid, p_dst, dst_offset, dst_valid);
         });
     }
     else
     {
-        for(index_t i = 0; i < DataPerAccess; i++)
+        for(index_t i = 0; i < DataPerAccess; ++i)
         {
             // TODO: use static_if::ElseIf
             static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
                 SetData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
-                    p_src, src_offset + i * SrcDataStride, p_dst, dst_offset + i * DstDataStride);
+                    p_src,
+                    src_offset + i * SrcDataStride,
+                    src_valid,
+                    p_dst,
+                    dst_offset + i * DstDataStride,
+                    dst_valid);
             });

             static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
                 AtomicAddData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
-                    p_src, src_offset + i * SrcDataStride, p_dst, dst_offset + i * DstDataStride);
+                    p_src,
+                    src_offset + i * SrcDataStride,
+                    src_valid,
+                    p_dst,
+                    dst_offset + i * DstDataStride,
+                    dst_valid);
             });
         }
     }
...
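
Finally, when either stride differs from 1, `transfer_data` falls back to `DataPerAccess` scalar transfers; all elements of one access share the same pair of validity flags, consistent with the first-element-only check noted earlier. A sketch of that fallback for the Set operation (hypothetical names, plain C++):

// Strided fallback: DataPerAccess scalar copies, one shared validity pair.
template <typename T, int DataPerAccess, int SrcStride, int DstStride>
void transfer_strided_sketch(const T* p_src, int src_off, bool src_valid,
                             T* p_dst, int dst_off, bool dst_valid)
{
    for(int i = 0; i < DataPerAccess; ++i)
    {
        if(dst_valid)
        {
            p_dst[dst_off + i * DstStride] =
                src_valid ? p_src[src_off + i * SrcStride] : T(0);
        }
    }
}
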