"...composable_kernel_rocm.git" did not exist on "7a7fe160866b7b2893be698d77b70cc8cf754fb5"
Commit c1e24c09 authored by Jing Zhang's avatar Jing Zhang
Browse files

clean code

parent 7abc0752
...@@ -85,7 +85,7 @@ struct BlockwiseGenericTensorSliceCopy_v5 ...@@ -85,7 +85,7 @@ struct BlockwiseGenericTensorSliceCopy_v5
if(BlockSize == mThreadClusterDesc.GetElementSize() or if(BlockSize == mThreadClusterDesc.GetElementSize() or
get_thread_local_1d_id() < mThreadClusterDesc.GetElementSize()) get_thread_local_1d_id() < mThreadClusterDesc.GetElementSize())
{ {
mThreadwiseCopy.Load(p_block_src, p_thread_buffer); mThreadwiseCopy.Load(p_block_src);
} }
} }
...@@ -96,7 +96,7 @@ struct BlockwiseGenericTensorSliceCopy_v5 ...@@ -96,7 +96,7 @@ struct BlockwiseGenericTensorSliceCopy_v5
if(BlockSize == mThreadClusterDesc.GetElementSize() or if(BlockSize == mThreadClusterDesc.GetElementSize() or
get_thread_local_1d_id() < mThreadClusterDesc.GetElementSize()) get_thread_local_1d_id() < mThreadClusterDesc.GetElementSize())
{ {
mThreadwiseCopy.Store(p_thread_buffer, p_block_dst); mThreadwiseCopy.Store(p_block_dst);
} }
} }
...@@ -115,8 +115,6 @@ struct BlockwiseGenericTensorSliceCopy_v5 ...@@ -115,8 +115,6 @@ struct BlockwiseGenericTensorSliceCopy_v5
get_thread_local_1d_id() < mThreadClusterDesc.GetElementSize()) get_thread_local_1d_id() < mThreadClusterDesc.GetElementSize())
{ {
RunLoadThreadBuffer(p_block_src, p_thread_buffer); RunLoadThreadBuffer(p_block_src, p_thread_buffer);
// if there is type conversion, it's done during store
RunStoreThreadBuffer(p_thread_buffer, p_block_dst); RunStoreThreadBuffer(p_thread_buffer, p_block_dst);
} }
} }
......
...@@ -75,8 +75,8 @@ struct ThreadwiseGenericTensorSliceCopy_v5 ...@@ -75,8 +75,8 @@ struct ThreadwiseGenericTensorSliceCopy_v5
mDstSliceOrigin = dst_slice_origin; mDstSliceOrigin = dst_slice_origin;
} }
template <typename SrcData, typename DstData> template <typename SrcData>
__device__ void Load(const SrcData* p_src, DstData* p_dst) __device__ void Load(const SrcData* p_src)
{ {
constexpr auto vector_access_dim = Number<SrcDstVectorReadWriteDim>{}; constexpr auto vector_access_dim = Number<SrcDstVectorReadWriteDim>{};
...@@ -92,18 +92,17 @@ struct ThreadwiseGenericTensorSliceCopy_v5 ...@@ -92,18 +92,17 @@ struct ThreadwiseGenericTensorSliceCopy_v5
ford<decltype(long_vector_access_lengths), SrcDstDimAccessOrder>{}( ford<decltype(long_vector_access_lengths), SrcDstDimAccessOrder>{}(
[&](auto long_vector_access_id) { [&](auto long_vector_access_id) {
// data id w.r.t slicing-window // data id w.r.t slicing-window
auto long_vector_data_begin_id = long_vector_access_id; auto long_vector_data_begin_id = long_vector_access_id;
long_vector_data_begin_id(vector_access_dim) = long_vector_data_begin_id(vector_access_dim) =
long_vector_size * long_vector_access_id[vector_access_dim]; long_vector_size * long_vector_access_id[vector_access_dim];
// buffer to hold a src long-vector // buffer to hold a src long-vector
SrcData p_src_long_vector[long_vector_size]; SrcData long_vector[long_vector_size];
#if 1 #if 1
// zero out buffer // zero out buffer
static_for<0, long_vector_size, 1>{}([&](auto i) { p_src_long_vector[i] = 0; }); static_for<0, long_vector_size, 1>{}([&](auto i) { long_vector[i] = 0; });
#endif #endif
// load data from src to the long-vector buffer // load data from src to the long-vector buffer
...@@ -130,19 +129,12 @@ struct ThreadwiseGenericTensorSliceCopy_v5 ...@@ -130,19 +129,12 @@ struct ThreadwiseGenericTensorSliceCopy_v5
src_coord.GetOffset(), src_coord.GetOffset(),
src_coord.IsOffsetValidAssumingUpperIndexIsValid(), src_coord.IsOffsetValidAssumingUpperIndexIsValid(),
SrcDesc::GetElementSpace(), SrcDesc::GetElementSpace(),
p_src_long_vector, long_vector,
buffer_offset, buffer_offset,
true, true,
long_vector_size); long_vector_size);
}); });
// SrcData to DstData conversion
DstData p_dst_long_vector[long_vector_size];
static_for<0, long_vector_size, 1>{}([&](auto i) {
p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
});
// store data from the long-vector buffer to dst // store data from the long-vector buffer to dst
static_for<0, long_vector_size / dst_data_per_access, 1>{}([&](auto i) { static_for<0, long_vector_size / dst_data_per_access, 1>{}([&](auto i) {
auto scalar_id = make_zero_multi_index<nDim>(); auto scalar_id = make_zero_multi_index<nDim>();
...@@ -153,34 +145,15 @@ struct ThreadwiseGenericTensorSliceCopy_v5 ...@@ -153,34 +145,15 @@ struct ThreadwiseGenericTensorSliceCopy_v5
const auto dst_coord = const auto dst_coord =
mDstSliceOrigin + (long_vector_data_begin_id + scalar_id); mDstSliceOrigin + (long_vector_data_begin_id + scalar_id);
auto buff_off =
auto buff_off = ThreadBufferDesc::CalculateOffset(long_vector_data_begin_id + scalar_id); ThreadBufferDesc::CalculateOffset(long_vector_data_begin_id + scalar_id);
thread_buff[buff_off] = p_dst_long_vector[buffer_offset]; thread_buff[buff_off] = long_vector[buffer_offset];
// Check dst data's valid mapping situation, only check the first data in this
// dst
// vector. It's user's responsiblity to make sure all data in the dst vector
// has the valid/invalid mapping situation
//transfer_data<DstData,
//DstDataPerWrite,
//AddressSpace::Vgpr,
//DstAddressSpace,
//DstInMemOp,
//1,
//DstDataStride>(p_dst_long_vector,
//buffer_offset,
//true,
//long_vector_size,
//thread_buff,
//dst_coord.GetOffset(),
//dst_coord.IsOffsetValidAssumingUpperIndexIsValid(),
//DstDesc::GetElementSpace());
}); });
}); });
} }
template <typename SrcData, typename DstData> template <typename DstData>
__device__ void Store(const SrcData* p_src, DstData* p_dst) __device__ void Store(DstData* p_dst)
{ {
constexpr auto vector_access_dim = Number<SrcDstVectorReadWriteDim>{}; constexpr auto vector_access_dim = Number<SrcDstVectorReadWriteDim>{};
...@@ -196,18 +169,17 @@ struct ThreadwiseGenericTensorSliceCopy_v5 ...@@ -196,18 +169,17 @@ struct ThreadwiseGenericTensorSliceCopy_v5
ford<decltype(long_vector_access_lengths), SrcDstDimAccessOrder>{}( ford<decltype(long_vector_access_lengths), SrcDstDimAccessOrder>{}(
[&](auto long_vector_access_id) { [&](auto long_vector_access_id) {
// data id w.r.t slicing-window // data id w.r.t slicing-window
auto long_vector_data_begin_id = long_vector_access_id; auto long_vector_data_begin_id = long_vector_access_id;
long_vector_data_begin_id(vector_access_dim) = long_vector_data_begin_id(vector_access_dim) =
long_vector_size * long_vector_access_id[vector_access_dim]; long_vector_size * long_vector_access_id[vector_access_dim];
// buffer to hold a src long-vector // buffer to hold a src long-vector
SrcData p_src_long_vector[long_vector_size]; DstData long_vector[long_vector_size];
#if 1 #if 1
// zero out buffer // zero out buffer
static_for<0, long_vector_size, 1>{}([&](auto i) { p_src_long_vector[i] = 0; }); static_for<0, long_vector_size, 1>{}([&](auto i) { long_vector[i] = 0; });
#endif #endif
// load data from src to the long-vector buffer // load data from src to the long-vector buffer
...@@ -217,40 +189,10 @@ struct ThreadwiseGenericTensorSliceCopy_v5 ...@@ -217,40 +189,10 @@ struct ThreadwiseGenericTensorSliceCopy_v5
const index_t buffer_offset = i * src_data_per_access; const index_t buffer_offset = i * src_data_per_access;
auto buff_off = ThreadBufferDesc::CalculateOffset(long_vector_data_begin_id + scalar_id); auto buff_off =
ThreadBufferDesc::CalculateOffset(long_vector_data_begin_id + scalar_id);
p_src_long_vector[buffer_offset] = thread_buff[buff_off];
//const auto src_coord =
//mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id);
// Check src data's valid mapping situation, only check the first data in this
// src
// vector. It's user's responsiblity to make sure all data in the src vector
// has the valid/invalid mapping situation
//transfer_data<SrcData,
//SrcDataPerRead,
//SrcAddressSpace,
//AddressSpace::Vgpr,
//InMemoryDataOperation::Set,
//SrcDataStride,
//1>(thread_buff,
//src_coord.GetOffset(),
//src_coord.IsOffsetValidAssumingUpperIndexIsValid(),
//SrcDesc::GetElementSpace(),
//p_src_long_vector,
//buffer_offset,
//true,
//long_vector_size);
});
// SrcData to DstData conversion
DstData p_dst_long_vector[long_vector_size];
static_for<0, long_vector_size, 1>{}([&](auto i) { long_vector[buffer_offset] = thread_buff[buff_off];
p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
}); });
// store data from the long-vector buffer to dst // store data from the long-vector buffer to dst
...@@ -273,7 +215,7 @@ struct ThreadwiseGenericTensorSliceCopy_v5 ...@@ -273,7 +215,7 @@ struct ThreadwiseGenericTensorSliceCopy_v5
DstAddressSpace, DstAddressSpace,
DstInMemOp, DstInMemOp,
1, 1,
DstDataStride>(p_dst_long_vector, DstDataStride>(long_vector,
buffer_offset, buffer_offset,
true, true,
long_vector_size, long_vector_size,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment