Commit 29053edd authored by Po-Yen, Chen's avatar Po-Yen, Chen
Browse files

Allow data transfer in 'GridwiseCopy'

parent e3e84e91
...@@ -198,8 +198,8 @@ struct GridwiseCopy ...@@ -198,8 +198,8 @@ struct GridwiseCopy
auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>( auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_in_global, in_grid_1d_desc.GetElementSpaceSize()); p_in_global, in_grid_1d_desc.GetElementSpaceSize());
// auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>( auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
// p_out_global, out_grid_1d_desc.GetElementSpaceSize()); p_out_global, out_grid_1d_desc.GetElementSpaceSize());
// const auto thread_global_offset = make_multi_index(thread_global_id * MPerThread); // const auto thread_global_offset = make_multi_index(thread_global_id * MPerThread);
...@@ -254,14 +254,14 @@ struct GridwiseCopy ...@@ -254,14 +254,14 @@ struct GridwiseCopy
// static_cast<ABDataType*>(p_shared) + a_block_space_size_aligned, // static_cast<ABDataType*>(p_shared) + a_block_space_size_aligned,
// b_block_desc_bk0_n_bk1.GetElementSpaceSize()); // b_block_desc_bk0_n_bk1.GetElementSpaceSize());
using SliceLengths = Sequence<NPerBlock, HPerBlock, WPerBlock>; using SliceLengths = Sequence<1, HPerBlock, WPerBlock>;
using ABlockTransferThreadClusterLengths_AK0_M_AK1 = Sequence<4, 64, 1>; using ABlockTransferThreadClusterLengths = Sequence<1, 16, BlockSize / 16>;
using ABlockTransferThreadClusterArrangeOrder = Sequence<0, 1, 2>; using ABlockTransferThreadClusterArrangeOrder = Sequence<0, 1, 2>;
using ABlockTransferSrcAccessOrder = Sequence<0, 1, 2>; using ABlockTransferSrcAccessOrder = Sequence<0, 1, 2>;
using ABlockTransferDstAccessOrder = Sequence<0, 1, 2>; using ABlockTransferDstAccessOrder = Sequence<0, 1, 2>;
constexpr index_t ABlockTransferSrcVectorDim = 2; constexpr index_t ABlockTransferSrcVectorDim = 2;
constexpr index_t ABlockTransferSrcScalarPerVector = 1;
constexpr index_t ABlockTransferDstVectorDim = 2; constexpr index_t ABlockTransferDstVectorDim = 2;
constexpr index_t ABlockTransferSrcScalarPerVector = 1;
constexpr index_t ABlockTransferDstScalarPerVector = 1; constexpr index_t ABlockTransferDstScalarPerVector = 1;
auto in_global_load = auto in_global_load =
...@@ -270,7 +270,7 @@ struct GridwiseCopy ...@@ -270,7 +270,7 @@ struct GridwiseCopy
ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough,
InMemoryDataOperationEnum::Set, InMemoryDataOperationEnum::Set,
SliceLengths, SliceLengths,
ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterLengths,
ABlockTransferThreadClusterArrangeOrder, ABlockTransferThreadClusterArrangeOrder,
InDataType, InDataType,
InDataType, InDataType,
...@@ -288,41 +288,51 @@ struct GridwiseCopy ...@@ -288,41 +288,51 @@ struct GridwiseCopy
true>( true>(
in_grid_1d_desc, in_grid_1d_desc,
make_multi_index(0, h_block_data_idx_on_grid, w_block_data_idx_on_grid), make_multi_index(0, h_block_data_idx_on_grid, w_block_data_idx_on_grid),
elementwise_op, ck::tensor_operation::element_wise::PassThrough{},
a_block_desc_ak0_m_ak1, a_block_desc_ak0_m_ak1,
make_multi_index(0, 0, 0), make_multi_index(0, 0, 0),
ck::tensor_operation::element_wise::PassThrough{}); ck::tensor_operation::element_wise::PassThrough{});
#endif #endif
// auto out_global_store = auto out_global_store = ThreadGroupTensorSliceTransfer_v4r1<
// ThreadwiseTensorSliceTransfer_v1r3<OutDataType, ThisThreadBlock,
// OutDataType, ElementwiseOperation,
// decltype(thread_buffer_desc_m), ck::tensor_operation::element_wise::PassThrough,
// decltype(out_grid_1d_desc), InMemoryDataOperationEnum::Set,
// PassThroughOp, Sequence<1, WPerBlock, HPerBlock>, // SliceLengths
// SliceLengths, // SliceLengths ABlockTransferThreadClusterLengths,
// Sequence<1, 0, 2>, // DimAccessOrder Sequence<0, 1, 2>, // ABlockTransferThreadClusterArrangeOrder
// 0, // SrcVectorDim InDataType,
// OutScalarPerVector, OutDataType,
// InMemoryDataOperationEnum::Set, decltype(a_block_desc_ak0_m_ak1),
// 1, decltype(out_grid_1d_desc),
// false>( Sequence<0, 1, 2>, // ABlockTransferSrcAccessOrder
// out_grid_1d_desc, thread_global_offset, PassThroughOp{}); Sequence<0, 2, 1>, // ABlockTransferDstAccessOrder
2, // ABlockTransferSrcVectorDim
1, // ABlockTransferDstVectorDim
1, // ABlockTransferSrcScalarPerVector
1, // ABlockTransferDstScalarPerVector
1,
1,
true,
true>(a_block_desc_ak0_m_ak1,
make_multi_index(0, 0, 0),
ck::tensor_operation::element_wise::PassThrough{},
out_grid_1d_desc,
make_multi_index(0, w_block_data_idx_on_grid, h_block_data_idx_on_grid),
elementwise_op);
index_t num_iter = in_grid_1d_desc.GetLength(I0); index_t num_iter = in_grid_1d_desc.GetLength(I0);
do do
{ {
// in_global_load.Run( in_global_load.Run(
// in_grid_1d_desc, in_global_buf, a_block_desc_ak0_m_ak1, a_block_buf, I0); in_grid_1d_desc, in_global_buf, a_block_desc_ak0_m_ak1, a_block_buf, I0);
// in_global_load.MoveSrcSliceWindow(in_grid_1d_desc, loop_step_index); in_global_load.MoveSrcSliceWindow(in_grid_1d_desc, loop_step_index);
// out_global_store.Run(thread_buffer_desc_m, out_global_store.Run(
// make_tuple(I0), a_block_desc_ak0_m_ak1, a_block_buf, out_grid_1d_desc, out_global_buf, I0);
// out_thread_buf,
// out_grid_1d_desc, out_global_store.MoveDstSliceWindow(out_grid_1d_desc, loop_step_index);
// out_global_buf);
//
// out_global_store.MoveDstSliceWindow(out_grid_1d_desc, loop_step_index);
} while(--num_iter); } while(--num_iter);
} }
}; };
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment