Commit 3b07df08 authored by Chao Liu

remove threadwise copy Run_optimized_src_address_calculation

parent e371df51
...@@ -84,46 +84,23 @@ struct BlockwiseGenericTensorSliceCopy_v4
__device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
ThreadBufferData* p_thread_buffer) const
{
constexpr bool has_optimized_address_calculation =
decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation();
if(BlockSize == mThreadClusterDesc.GetElementSize() or
get_thread_local_1d_id() < mThreadClusterDesc.GetElementSize())
{
// TODO: threadwise copy is still being tweaked
if(has_optimized_address_calculation)
{
mThreadwiseLoad.Run_optimized_src_address_calculation(p_block_src, p_thread_buffer);
}
else
{
mThreadwiseLoad.Run(p_block_src, p_thread_buffer);
}
}
}
template <typename ThreadBufferData, typename BlockDstData>
__device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
BlockDstData* p_block_dst) const
{
constexpr bool has_optimized_address_calculation =
decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation();
if(BlockSize == mThreadClusterDesc.GetElementSize() or
get_thread_local_1d_id() < mThreadClusterDesc.GetElementSize())
{
// TODO: threadwise copy is still being tweaked
if(has_optimized_address_calculation)
{
mThreadwiseStore.Run_optimized_dst_address_calculation(p_thread_buffer,
p_block_dst);
}
else
{
mThreadwiseStore.Run(p_thread_buffer, p_block_dst);
}
}
}
template <typename BlockSrcData, typename BlockDstData>
__device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const
...
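A minimal sketch of how the blockwise load path reads after this change, assuming the template header mirrors the store path shown above (only the thread-cluster guard and the plain Run call remain):

template <typename BlockSrcData, typename ThreadBufferData> // assumed parameter order
__device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
                                    ThreadBufferData* p_thread_buffer) const
{
    // only threads that map onto the thread cluster take part in the copy
    if(BlockSize == mThreadClusterDesc.GetElementSize() or
       get_thread_local_1d_id() < mThreadClusterDesc.GetElementSize())
    {
        mThreadwiseLoad.Run(p_block_src, p_thread_buffer);
    }
}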
...@@ -93,11 +93,13 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// buffer to hold a src long-vector
SrcData p_src_long_vector[long_vector_size];
#if 1
// zero out buffer
for(index_t i = 0; i < long_vector_size; ++i)
{
p_src_long_vector[i] = 0;
}
#endif
// load data from src to the long-vector buffer
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
...@@ -121,9 +123,11 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
1>(p_src,
src_coord.GetOffset(),
src_coord.IsOffsetValidAssumingUpperIndexIsValid(),
SrcDesc::GetElementSpace(),
p_src_long_vector,
buffer_offset,
true); true,
long_vector_size);
}
// SrcData to DstData conversion
...@@ -156,326 +160,13 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
DstDataStride>(p_dst_long_vector,
buffer_offset,
true,
long_vector_size,
p_dst,
dst_coord.GetOffset(),
dst_coord.IsOffsetValidAssumingUpperIndexIsValid()); dst_coord.IsOffsetValidAssumingUpperIndexIsValid(),
} DstDesc::GetElementSpace());
});
}
// Modify Length to 1, if Mask is set to false
// Used for isolating linear dimension from non-linear dimensions
template <index_t... Lengths, index_t... Mask>
__device__ static constexpr auto mask_lengths(Sequence<Lengths...>, Sequence<Mask...>)
{
return Sequence<(Mask ? Lengths : 1)...>{};
}
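// Illustrative usage (values are assumed, not taken from this file):
//   mask_lengths(Sequence<8, 4, 2>{}, Sequence<1, 0, 1>{}) yields Sequence<8, 1, 2>{},
//   i.e. the masked-out middle dimension collapses to length 1 while the other
//   lengths are kept unchanged.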
// Will do valid mapping check on src data: Read 0 if src data has an invalid mapping
// Will do valid mapping check on dst data: No write if dst data has an invalid mapping
// This version is optimized for address calculation of src tensor
// TODO: this function does not compile to the expected ISA
template <typename SrcData, typename DstData>
__device__ void Run_optimized_src_address_calculation(const SrcData* p_src,
DstData* p_dst) const
{
constexpr auto vector_access_dim = Number<SrcDstVectorReadWriteDim>{};
constexpr auto src_data_per_access = Number<SrcDataPerRead>{};
constexpr auto dst_data_per_access = Number<DstDataPerWrite>{};
constexpr auto long_vector_size = Number<math::lcm(SrcDataPerRead, DstDataPerWrite)>{};
constexpr auto long_vector_access_lengths = SliceLengths::Modify(
vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
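// Illustrative numbers (assumed, not taken from this file): with SrcDataPerRead = 4 and
// DstDataPerWrite = 2, long_vector_size = lcm(4, 2) = 4, so the number of accesses along
// vector_access_dim is the slice length in that dimension divided by 4.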
// separate linear dimensions from non-linear dimensions
constexpr auto src_linear_dim_mask = SrcDesc::GetLinearDimensionMask();
constexpr auto src_nonlinear_dim_mask = SrcDesc::GetNonLinearDimensionMask();
static_assert(
src_linear_dim_mask.At(SrcDstVectorReadWriteDim) || long_vector_size == SrcDataPerRead,
"Warning! SrcDstVectorReadWriteDim is not SrcDesc's linear dimension, performance "
"would drop");
// separate steps into linear and non-linear components, according to src tensor
constexpr auto linear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, src_linear_dim_mask);
constexpr auto nonlinear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, src_nonlinear_dim_mask);
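// Illustrative split (assumed values): if long_vector_access_lengths = Sequence<2, 3, 4>
// and src_linear_dim_mask = Sequence<0, 1, 1> (so src_nonlinear_dim_mask = Sequence<1, 0, 0>),
// then linear_long_vector_access_lengths = Sequence<1, 3, 4> and
// nonlinear_long_vector_access_lengths = Sequence<2, 1, 1>; the two nested ford loops
// below together still visit all 2 * 3 * 4 accesses.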
// loop over src's non-linear dimensions
ford<decltype(nonlinear_long_vector_access_lengths)>{}([&](
auto nonlinear_dim_long_vector_access_id) {
// calculate step-sizes along src's nonlinear dimensions
auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id;
nonlinear_dim_data_steps(vector_access_dim) =
long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim];
// move src coordinate along nonlinear dimensions
// this coordinate contains run-time per-thread offset
const auto src_nonlinear_coord = mSrcSliceOrigin + nonlinear_dim_data_steps;
// loop over src's linear dimensions
ford<decltype(linear_long_vector_access_lengths)>{}([&](
auto linear_dim_long_vector_access_id) {
// step-sizes along src's linear dimensions
auto linear_dim_data_steps = linear_dim_long_vector_access_id;
linear_dim_data_steps(vector_access_dim) =
long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];
// buffer to hold a long-vector
SrcData p_src_long_vector[long_vector_size];
// zero out buffer
for(index_t i = 0; i < long_vector_size; ++i)
{
p_src_long_vector[i] = 0;
}
// Loop over SrcDstVectorReadWriteDim, and load data from src to the
// long-vector buffer.
// If SrcDstVectorReadWriteDim is src's linear dimension, then src's
// offset-diff due to this looping is known at compile-time. If
// SrcDstVectorReadWriteDim is src's nonlinear dimension, then src's
// offset-diff due to this looping is only known at run-time. For best
// performance, SrcDstVectorReadWriteDim should be src's linear dimension
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access;
const index_t buffer_offset = i * src_data_per_access;
// move src coordinate along linear dimensions
const auto src_coord =
src_nonlinear_coord + (linear_dim_data_steps + scalar_id);
#if CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF // tweaking
// this is src compile-time offset
const index_t src_linear_offset =
src_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
#else
// this is src compile-time offset
const index_t src_linear_offset =
src_coord.GetOffset() - src_nonlinear_coord.GetOffset();
#endif
// Check src data's valid mapping situation; only the first datum in this src
// vector is checked. It is the user's responsibility to make sure all data in
// the src vector share the same valid/invalid mapping situation
transfer_data<SrcData,
SrcDataPerRead,
SrcAddressSpace,
AddressSpace::Vgpr,
InMemoryDataOperation::Set>(
p_src,
src_nonlinear_coord.GetOffset() + src_linear_offset,
src_coord.IsOffsetValidAssumingUpperIndexIsValid(),
p_src_long_vector,
buffer_offset,
true);
}
// SrcData to DstData conversion
DstData p_dst_long_vector[long_vector_size];
for(index_t i = 0; i < long_vector_size; ++i)
{
p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
}
// store data from the long-vector buffer to dst
for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * dst_data_per_access;
const index_t buffer_offset = i * dst_data_per_access;
// dst offset is calculated here, without explicitly separating into
// compile-time and per-thread component
const auto dst_coord = mDstSliceOrigin + (nonlinear_dim_data_steps +
linear_dim_data_steps + scalar_id);
// Check dst data's valid mapping situation; only the first datum in this dst
// vector is checked. It is the user's responsibility to make sure all data in
// the dst vector share the same valid/invalid mapping situation
transfer_data<DstData,
DstDataPerWrite,
AddressSpace::Vgpr,
DstAddressSpace,
DstInMemOp>(p_dst_long_vector,
buffer_offset,
true,
p_dst,
dst_coord.GetOffset(),
dst_coord.IsOffsetValidAssumingUpperIndexIsValid());
}
});
});
}
// This version is optimized for address calculation of dst tensor
// TODO: this function does not compile to the expected ISA
template <typename SrcData, typename DstData>
__device__ void Run_optimized_dst_address_calculation(const SrcData* p_src,
DstData* p_dst) const
{
constexpr auto vector_access_dim = Number<SrcDstVectorReadWriteDim>{};
constexpr auto src_data_per_access = Number<SrcDataPerRead>{};
constexpr auto dst_data_per_access = Number<DstDataPerWrite>{};
constexpr auto long_vector_size = Number<math::lcm(SrcDataPerRead, DstDataPerWrite)>{};
constexpr auto long_vector_access_lengths = SliceLengths::Modify(
vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
// separate linear dimensions from non-linear dimensions
constexpr auto dst_linear_dim_mask = DstDesc::GetLinearDimensionMask();
constexpr auto dst_nonlinear_dim_mask = DstDesc::GetNonLinearDimensionMask();
static_assert(
dst_linear_dim_mask.At(SrcDstVectorReadWriteDim) || long_vector_size == DstDataPerWrite,
"Warning! SrcDstVectorReadWriteDim is not DstDesc's linear dimension, performance "
"would drop");
// separate steps into linear and non-linear components, according to dst tensor
constexpr auto linear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, dst_linear_dim_mask);
constexpr auto nonlinear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, dst_nonlinear_dim_mask);
// loop over dst's non-linear dimensions
ford<decltype(nonlinear_long_vector_access_lengths)>{}([&](
auto nonlinear_dim_long_vector_access_id) {
// calculate step-sizes along dst's nonlinear dimensions
auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id;
nonlinear_dim_data_steps(vector_access_dim) =
long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim];
// move dst coordinate along nonlinear dimensions
// this coordinate contains run-time per-thread offset
const auto dst_nonlinear_coord = mDstSliceOrigin + nonlinear_dim_data_steps;
// loop over dst's linear dimensions
ford<decltype(linear_long_vector_access_lengths)>{}([&](
auto linear_dim_long_vector_access_id) {
// step-sizes along dst's linear dimensions
auto linear_dim_data_steps = linear_dim_long_vector_access_id;
linear_dim_data_steps(vector_access_dim) =
long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];
// buffer to hold a long-vector
SrcData p_src_long_vector[long_vector_size];
// zero out buffer
for(index_t i = 0; i < long_vector_size; ++i)
{
p_src_long_vector[i] = 0;
}
// Loop over SrcDstVectorReadWriteDim, and load data from src to the
// long-vector buffer.
// If SrcDstVectorReadWriteDim is dst's linear dimension, then dst's
// offset-diff due to this looping is known at compile-time. If
// SrcDstVectorReadWriteDim is dst's nonlinear dimension, then dst's
// offset-diff due to this looping is only known at run-time. For best
// performance, SrcDstVectorReadWriteDim should be dst's linear dimension
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access;
const index_t buffer_offset = i * src_data_per_access;
// src offset is calculated here, without explicitly separating into
// compile-time and per-thread component
const auto src_coord = mSrcSliceOrigin + (nonlinear_dim_data_steps +
linear_dim_data_steps + scalar_id);
// Check src data's valid mapping situation; only the first datum in this src
// vector is checked. It is the user's responsibility to make sure all data in
// the src vector share the same valid/invalid mapping situation
transfer_data<SrcData,
SrcDataPerRead,
SrcAddressSpace,
AddressSpace::Vgpr,
InMemoryDataOperation::Set>(
p_src,
src_coord.GetOffset(),
src_coord.IsOffsetValidAssumingUpperIndexIsValid(),
p_src_long_vector,
buffer_offset,
true);
}
// SrcData to DstData conversion
DstData p_dst_long_vector[long_vector_size];
for(index_t i = 0; i < long_vector_size; ++i)
{
p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
}
// store data from the long-vector buffer to dst
for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * dst_data_per_access;
const index_t buffer_offset = i * dst_data_per_access;
// move dst coordinate along linear dimensions
const auto dst_coord =
dst_nonlinear_coord + (linear_dim_data_steps + scalar_id);
#if CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF // tweaking
// this is dst compile-time offset
const index_t dst_linear_offset =
dst_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
#else
// this is dst compile-time offset
const index_t dst_linear_offset =
dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset();
#endif
// Check dst data's valid mapping situation; only the first datum in this dst
// vector is checked. It is the user's responsibility to make sure all data in
// the dst vector share the same valid/invalid mapping situation
transfer_data<DstData,
DstDataPerWrite,
AddressSpace::Vgpr,
DstAddressSpace,
DstInMemOp>(p_dst_long_vector,
buffer_offset,
true,
p_dst,
dst_nonlinear_coord.GetOffset() + dst_linear_offset,
dst_coord.IsOffsetValidAssumingUpperIndexIsValid());
}
});
});
}
__device__ static constexpr bool HasWorkingOptimizedAddressCalculation()
{
#if CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION // tweaking
return true;
#else
return false;
#endif
}
template <typename T, bool PositiveDirection>
...