"git@developer.sourcefind.cn:gaoqiong/composable_kernel.git" did not exist on "753b98b59ac317af36e856125eecf33eccbcde8c"
Commit 0f52c4c0 authored by Chao Liu

added type conversion in threadwise and blockwise copy

parent b3d4595f
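For context, the new conversion loops in the diff below call a type_convert functor, as in type_convert<DstData>{}(p_src_long_vector[i]). A minimal sketch of what such a functor could look like is given here as plain, host-compilable C++; this is an assumption about its shape, not the repository's definition, which would be __device__-qualified and may carry specializations for half/bfloat16 conversions.

#include <cstdio>

// Hedged sketch of a conversion functor in the spirit of type_convert<DstData>{}(src).
// The real helper in composable_kernel may differ; this one just wraps static_cast.
template <typename DstData>
struct type_convert
{
    template <typename SrcData>
    DstData operator()(SrcData v) const
    {
        return static_cast<DstData>(v);
    }
};

int main()
{
    float f       = 3.75f;
    int as_int    = type_convert<int>{}(f);    // 3
    double as_dbl = type_convert<double>{}(f); // 3.75
    std::printf("%d %.2f\n", as_int, as_dbl);
    return 0;
}

Wrapping the cast in a functor keeps the call sites uniform even when a plain static_cast is not sufficient, for example when converting to packed 16-bit types.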
@@ -437,9 +437,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
                 Float,
                 address_space_t::generic,
                 address_space_t::global>
-#elif 1
+#else // tweaking
                 .template Run_optimized_dst_address_calculation<Float,
                                                                 Float,
+                                                                address_space_t::generic,
                                                                 address_space_t::global>
 #endif
                 (p_out_thread, p_out_global);
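The hunk above moves the gridwise convolution onto the tweaked path and spells out all four template arguments of the call: source element type, destination element type, and the source and destination address spaces. The sketch below only illustrates the call shape, a member template invoked through .template with explicit source/destination types; ToyCopy, call_copy, and Run are hypothetical names, not the repository's classes.

#include <cstdio>

// Hedged illustration of invoking a member template with explicit src/dst element
// types through ".template", mirroring the call shape in the hunk above.
struct ToyCopy
{
    template <typename SrcData, typename DstData>
    void Run(const SrcData* p_src, DstData* p_dst, int n) const
    {
        for(int i = 0; i < n; ++i)
            p_dst[i] = static_cast<DstData>(p_src[i]); // conversion happens inside the copy
    }
};

template <typename Copy>
void call_copy(const Copy& copy, const float* p_src, double* p_dst, int n)
{
    // ".template" is required because Run is a member template of a dependent type
    copy.template Run<float, double>(p_src, p_dst, n);
}

int main()
{
    ToyCopy copy;
    float src[4]  = {1.0f, 2.0f, 3.0f, 4.0f};
    double dst[4] = {};
    call_copy(copy, src, dst, 4);
    std::printf("%.1f %.1f %.1f %.1f\n", dst[0], dst[1], dst[2], dst[3]);
    return 0;
}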
@@ -772,7 +772,7 @@ struct BlockwiseGenericTensorSliceCopy_v4
                                              BlockSrcAddressSpace,
                                              ThreadBufferAddressSpace>(p_block_src,
                                                                        p_thread_buffer);
-#else
+#else // tweaking
         mThreadwiseLoad.template Run_optimized_src_address_calculation<BlockSrcData,
                                                                        ThreadBufferData,
                                                                        BlockSrcAddressSpace,

@@ -793,7 +793,7 @@ struct BlockwiseGenericTensorSliceCopy_v4
                                               BlockDstData,
                                               ThreadBufferAddressSpace,
                                               BlockDstAddressSpace>(p_thread_buffer, p_block_dst);
-#else
+#else // tweaking
         mThreadwiseStore.template Run_optimized_dst_address_calculation<ThreadBufferData,
                                                                         BlockDstData,
                                                                         ThreadBufferAddressSpace,

@@ -1226,7 +1226,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 const auto src_coord = mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id);

                 // Check src vector's padding situation, only check the first data in this src
-                // vector. It's user's responsiblity to make sure all data in the src vector has
+                // vector. It's user's responsiblity to make sure all data in the src vector
+                // has
                 // the same padding situation
                 if(src_coord.IsUpperIndexMappedToValidOffset())
                 {

@@ -1266,7 +1267,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 const auto dst_coord = mDstSliceOrigin + (long_vector_data_begin_id + scalar_id);

                 // Check dst vector's padding situation, only check the first data in this dst
-                // vector. It's user's responsiblity to make sure all data in the dst vector has
+                // vector. It's user's responsiblity to make sure all data in the dst vector
+                // has
                 // the same padding situation
                 if(dst_coord.IsUpperIndexMappedToValidOffset())
                 {
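The two hunks above only rewrap a comment, but the rule it states is worth spelling out: padding validity is checked for the first element of each vector access only, and the caller must guarantee that every element of that vector shares the same padding situation, so reads from padding yield zeros and writes into padding are skipped. The standalone sketch below illustrates that contract on a 1-D buffer; all names are hypothetical and it is a simplification, not the copy classes' logic.

#include <cstddef>
#include <vector>

// Hedged, simplified illustration of the comment above: validity is tested only for
// the first element of each vector; the caller guarantees the rest of the vector
// shares the same padding situation.
constexpr std::ptrdiff_t kVectorSize = 4;

static bool is_valid(std::ptrdiff_t idx, std::ptrdiff_t n) { return idx >= 0 && idx < n; }

void copy_with_padding_check(const std::vector<float>& src,
                             std::vector<float>& dst,
                             std::ptrdiff_t src_origin,
                             std::ptrdiff_t dst_origin,
                             std::ptrdiff_t length)
{
    const auto src_n = static_cast<std::ptrdiff_t>(src.size());
    const auto dst_n = static_cast<std::ptrdiff_t>(dst.size());

    for(std::ptrdiff_t i = 0; i < length; i += kVectorSize)
    {
        float buf[kVectorSize] = {}; // elements that map to padding stay zero

        // check only the first element of the source vector ("read 0 if in padding")
        if(is_valid(src_origin + i, src_n))
            for(std::ptrdiff_t j = 0; j < kVectorSize; ++j)
                buf[j] = src[src_origin + i + j];

        // check only the first element of the destination vector ("no write if in padding")
        if(is_valid(dst_origin + i, dst_n))
            for(std::ptrdiff_t j = 0; j < kVectorSize; ++j)
                dst[dst_origin + i + j] = buf[j];
    }
}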
@@ -1305,13 +1307,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2

     // Will do padding check on src data: Read 0 if src data is in padding area.
     // Will do padding check on dst data: No write if dst data is in paddin area.
     // This version is optimized for address calculation of src tensor
-    template <typename TData,
+    template <typename SrcData,
+              typename DstData,
               address_space_t SrcAddressSpace = address_space_t::generic,
               address_space_t DstAddressSpace = address_space_t::generic>
-    __device__ void Run_optimized_src_address_calculation(const TData* p_src, TData* p_dst) const
+    __device__ void Run_optimized_src_address_calculation(const SrcData* p_src,
+                                                          DstData* p_dst) const
     {
-        using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
-        using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
+        using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
+        using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;

         constexpr auto vector_access_dim = Number<VectorAccessDim>{};

@@ -1327,9 +1331,10 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
         constexpr auto src_linear_dim_mask = SrcDesc::GetLinearDimensionMask();
         constexpr auto src_nonlinear_dim_mask = SrcDesc::GetNonLinearDimensionMask();

-        static_assert(
-            src_linear_dim_mask.At(VectorAccessDim) || long_vector_size == SrcDataPerAccess,
-            "Warning! VectorAccessDim is not SrcDesc's linear dimension, performance would drop");
+        static_assert(src_linear_dim_mask.At(VectorAccessDim) ||
+                          long_vector_size == SrcDataPerAccess,
+                      "Warning! VectorAccessDim is not SrcDesc's linear dimension, performance "
+                      "would drop");

         // separate steps into linear and non-linear components, accoording to src tensor
         constexpr auto linear_long_vector_access_lengths =

@@ -1361,12 +1366,12 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];

             // buffer to hold a long-vector
-            TData p_long_vector[long_vector_size];
+            SrcData p_src_long_vector[long_vector_size];

             // zero out buffer
             for(index_t i = 0; i < long_vector_size; ++i)
             {
-                p_long_vector[i] = 0;
+                p_src_long_vector[i] = 0;
             }

             // Loop over VectorAccessDim, and load data from src to the

@@ -1399,22 +1404,30 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 {
                     static_if<SrcAddressSpace == address_space_t::global>{}([&](auto) {
 #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
-                        *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
-                            __buffer_load<TData, SrcDataPerAccess>(
+                        *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
+                            __buffer_load<SrcData, SrcDataPerAccess>(
                                 p_src, src_nonlinear_coord.GetOffset(), src_linear_offset);
 #else
-                        *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
+                        *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                             *reinterpret_cast<const src_vector_t*>(
                                 &p_src[src_nonlinear_coord.GetOffset() + src_linear_offset]);
 #endif
                     }).Else([&](auto) {
-                        *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
+                        *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                             *reinterpret_cast<const src_vector_t*>(
                                 &p_src[src_nonlinear_coord.GetOffset() + src_linear_offset]);
                     });
                 }
             }

+            // SrcData to DstData conversion
+            DstData p_dst_long_vector[long_vector_size];
+
+            for(index_t i = 0; i < long_vector_size; ++i)
+            {
+                p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
+            }
+
             // store data from the long-vector buffer to dst
             for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
             {

@@ -1434,7 +1447,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 if(dst_coord.IsUpperIndexMappedToValidOffset())
                 {
                     *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
-                        *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
+                        *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
                 }
             }
         });
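The hunks above restage the copy into three steps: load into a long-vector buffer typed as SrcData, convert it element-wise into a second buffer typed as DstData via type_convert, then store from the DstData buffer. A host-side sketch of that staging, with placeholder names and a plain static_cast standing in for type_convert, is shown below; it omits the vectorized accesses, address-space dispatch, and padding checks of the real code.

#include <cstddef>

// Hedged sketch of the load / convert / store staging introduced above.
// LongVectorSize and the pointer arguments are illustrative placeholders.
template <typename SrcData, typename DstData, std::size_t LongVectorSize>
void staged_copy(const SrcData* p_src, DstData* p_dst)
{
    // 1) load into a SrcData-typed long-vector buffer (zero-initialized, as in the diff)
    SrcData p_src_long_vector[LongVectorSize] = {};
    for(std::size_t i = 0; i < LongVectorSize; ++i)
        p_src_long_vector[i] = p_src[i];

    // 2) SrcData to DstData conversion (the commit uses type_convert<DstData>{}(...))
    DstData p_dst_long_vector[LongVectorSize];
    for(std::size_t i = 0; i < LongVectorSize; ++i)
        p_dst_long_vector[i] = static_cast<DstData>(p_src_long_vector[i]);

    // 3) store from the DstData-typed buffer
    for(std::size_t i = 0; i < LongVectorSize; ++i)
        p_dst[i] = p_dst_long_vector[i];
}

Converting in the per-thread buffer keeps the loads in the source element type and the stores in the destination element type, so each side can still use its own vector width (SrcDataPerAccess and DstDataPerAccess).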
@@ -1447,13 +1460,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2

     // Will do padding check on src data: Read 0 if src data is in padding area.
     // Will do padding check on dst data: No write if dst data is in paddin area.
     // This version is optimized for address calculation of dst tensor
-    template <typename TData,
+    template <typename SrcData,
+              typename DstData,
               address_space_t SrcAddressSpace = address_space_t::generic,
               address_space_t DstAddressSpace = address_space_t::generic>
-    __device__ void Run_optimized_dst_address_calculation(const TData* p_src, TData* p_dst) const
+    __device__ void Run_optimized_dst_address_calculation(const SrcData* p_src,
+                                                          DstData* p_dst) const
     {
-        using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
-        using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
+        using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
+        using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;

         constexpr auto vector_access_dim = Number<VectorAccessDim>{};

@@ -1469,9 +1484,10 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
         constexpr auto dst_linear_dim_mask = DstDesc::GetLinearDimensionMask();
         constexpr auto dst_nonlinear_dim_mask = DstDesc::GetNonLinearDimensionMask();

-        static_assert(
-            dst_linear_dim_mask.At(VectorAccessDim) || long_vector_size == DstDataPerAccess,
-            "Warning! VectorAccessDim is not DstDesc's linear dimension, performance would drop");
+        static_assert(dst_linear_dim_mask.At(VectorAccessDim) ||
+                          long_vector_size == DstDataPerAccess,
+                      "Warning! VectorAccessDim is not DstDesc's linear dimension, performance "
+                      "would drop");

         // separate steps into linear and non-linear components, accoording to dst tensor
         constexpr auto linear_long_vector_access_lengths =

@@ -1503,12 +1519,12 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];

             // buffer to hold a long-vector
-            TData p_long_vector[long_vector_size];
+            SrcData p_src_long_vector[long_vector_size];

             // zero out buffer
             for(index_t i = 0; i < long_vector_size; ++i)
             {
-                p_long_vector[i] = 0;
+                p_src_long_vector[i] = 0;
             }

             // Loop over VectorAccessDim, and load data from src to the

@@ -1535,11 +1551,19 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 // the src vector has the same padding situation
                 if(src_coord.IsUpperIndexMappedToValidOffset())
                 {
-                    *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
+                    *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                         *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
                 }
             }

+            // SrcData to DstData conversion
+            DstData p_dst_long_vector[long_vector_size];
+
+            for(index_t i = 0; i < long_vector_size; ++i)
+            {
+                p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
+            }
+
             // store data from the long-vector buffer to dst
             for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
             {

@@ -1564,20 +1588,20 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 {
                     static_if<DstAddressSpace == address_space_t::global>{}([&](auto) {
 #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
-                        __buffer_store<TData, DstDataPerAccess>(
-                            *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]),
+                        __buffer_store<DstData, DstDataPerAccess>(
+                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
                             p_dst,
                             dst_nonlinear_coord.GetOffset(),
                             dst_linear_offset);
 #else
                         *reinterpret_cast<dst_vector_t*>(
                             &p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]) =
-                            *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
+                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
 #endif
                     }).Else([&](auto) {
                         *reinterpret_cast<dst_vector_t*>(
                             &p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]) =
-                            *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
+                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
                     });
                 }
             }
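In the store hunk above the selection between the AMD buffer intrinsic and a plain vectorized store happens at two levels: a static_if on the destination address space and a preprocessor switch on CK_USE_AMD_INTRINSIC / CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE. The snippet below sketches a rough standard-C++ analogue of that dispatch, using if constexpr in place of static_if and a memcpy stub in place of __buffer_store; every name here is illustrative and the real intrinsic path is not reproduced.

#include <cstring>

enum class address_space_t { generic, global }; // local stand-in for the sketch

// stand-in for a platform intrinsic store such as __buffer_store
template <typename DstData, int DataPerAccess>
void fake_buffer_store(const DstData* v, DstData* p_dst, long offset)
{
    std::memcpy(p_dst + offset, v, sizeof(DstData) * DataPerAccess);
}

// Hedged analogue of the dispatch above: take the "intrinsic" path only for
// global-memory destinations, otherwise fall back to a plain elementwise store.
template <typename DstData, int DataPerAccess, address_space_t DstAddressSpace>
void store_vector(const DstData* p_dst_long_vector, DstData* p_dst, long offset)
{
    if constexpr(DstAddressSpace == address_space_t::global)
    {
        fake_buffer_store<DstData, DataPerAccess>(p_dst_long_vector, p_dst, offset);
    }
    else
    {
        for(int i = 0; i < DataPerAccess; ++i)
            p_dst[offset + i] = p_dst_long_vector[i];
    }
}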