Commit 0399af7d authored by Po-Yen, Chen's avatar Po-Yen, Chen
Browse files

Fix most of compilation errors

parent 0ba41814
...@@ -63,10 +63,14 @@ struct Block2TileMap ...@@ -63,10 +63,14 @@ struct Block2TileMap
}); });
std::array<index_t, NumDim> divisors; std::array<index_t, NumDim> divisors;
std::partial_sum(rbegin(num_tiles_per_axis), index_t product = 1;
rend(num_tiles_per_axis), auto divisor = rbegin(divisors);
rbegin(divisors), for(auto num_tiles = rbegin(num_tiles_per_axis); num_tiles != rend(num_tiles_per_axis);
std::multiplies<index_t>{}); ++num_tiles)
{
product *= (*num_tiles);
*(divisor++) = product;
}
const index_t grid_size = divisors.front(); const index_t grid_size = divisors.front();
block_1d_id = block_1d_id % grid_size; // swallow batch index block_1d_id = block_1d_id % grid_size; // swallow batch index
...@@ -83,7 +87,7 @@ struct Block2TileMap ...@@ -83,7 +87,7 @@ struct Block2TileMap
}; };
} // namespace detail } // namespace detail
template <typename GridwiseCopyFunctor, template <typename GridwiseCopy,
typename InGrid1dDesc, typename InGrid1dDesc,
typename OutGrid1dDesc, typename OutGrid1dDesc,
typename InDataTypePointer, typename InDataTypePointer,
...@@ -97,12 +101,15 @@ __global__ void kernel_nd_copy(const InGrid1dDesc in_grid_1d_desc, ...@@ -97,12 +101,15 @@ __global__ void kernel_nd_copy(const InGrid1dDesc in_grid_1d_desc,
const ElementwiseOperation elementwise_op, const ElementwiseOperation elementwise_op,
const Block2TileMap block_2_tile_map) const Block2TileMap block_2_tile_map)
{ {
GridwiseCopyFunctor::Run(in_grid_1d_desc, __shared__ char p_shared[GridwiseCopy::GetSharedMemoryNumberOfByte()];
out_grid_1d_desc,
p_in_global, GridwiseCopy::Run(in_grid_1d_desc,
p_out_global, out_grid_1d_desc,
elementwise_op, p_in_global,
block_2_tile_map); p_out_global,
p_shared,
elementwise_op,
block_2_tile_map);
} }
template <typename InGrid1dDesc, template <typename InGrid1dDesc,
...@@ -140,11 +147,26 @@ struct GridwiseCopy ...@@ -140,11 +147,26 @@ struct GridwiseCopy
constexpr index_t ABlockLdsExtraM = 0; constexpr index_t ABlockLdsExtraM = 0;
// A matrix in LDS memory, dst of blockwise copy // A matrix in LDS memory, dst of blockwise copy
return make_naive_tensor_descriptor( return make_naive_tensor_descriptor(make_tuple(1, Number<HPerBlock>{}, Number<WPerBlock>{}),
make_tuple(Number<NPerBlock>{}, Number<HPerBlock>{}, Number<WPerBlock>{}), make_tuple(Number<WPerBlock + ABlockLdsExtraM>{},
make_tuple(Number<NPerBlock + ABlockLdsExtraM>{} * Number<HPerBlock>{}, Number<WPerBlock + ABlockLdsExtraM>{},
Number<HPerBlock>{}, I1));
I1)); }
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
{
// LDS allocation for A and B: be careful of alignment
constexpr auto a_block_desc_ak0_m_ak1 = GetInBlockDescriptor();
using InDataType = remove_cv_t<remove_pointer_t<InDataTypePointer>>;
// lds max alignment
constexpr auto max_lds_align = 1;
constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
return a_block_space_size_aligned * sizeof(InDataType);
} }
__host__ __device__ static constexpr auto MakeDefaultBlock2TileMap(const InGrid1dDesc& desc) __host__ __device__ static constexpr auto MakeDefaultBlock2TileMap(const InGrid1dDesc& desc)
...@@ -157,30 +179,33 @@ struct GridwiseCopy ...@@ -157,30 +179,33 @@ struct GridwiseCopy
const OutGrid1dDesc out_grid_1d_desc, const OutGrid1dDesc out_grid_1d_desc,
const InDataTypePointer p_in_global, const InDataTypePointer p_in_global,
const OutDataTypePointer p_out_global, const OutDataTypePointer p_out_global,
void* __restrict__ p_shared,
const ElementwiseOperation elementwise_op, const ElementwiseOperation elementwise_op,
const Block2TileMap& block_2_tile_map) const Block2TileMap& block_2_tile_map)
{ {
const index_t thread_global_id = get_thread_global_1d_id(); // const index_t thread_global_id = get_thread_global_1d_id();
using InDataType = remove_cv_t<remove_pointer_t<InDataTypePointer>>; using InDataType = remove_cv_t<remove_pointer_t<InDataTypePointer>>;
auto in_thread_buf = StaticBuffer<AddressSpaceEnum::Vgpr, InDataType, MPerThread, true>{}; // auto in_thread_buf = StaticBuffer<AddressSpaceEnum::Vgpr, InDataType, MPerThread,
// true>{};
using OutDataType = remove_cv_t<remove_pointer_t<OutDataTypePointer>>; using OutDataType = remove_cv_t<remove_pointer_t<OutDataTypePointer>>;
auto out_thread_buf = StaticBuffer<AddressSpaceEnum::Vgpr, OutDataType, MPerThread, true>{}; // auto out_thread_buf = StaticBuffer<AddressSpaceEnum::Vgpr, OutDataType, MPerThread,
// true>{};
auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>( auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_in_global, in_grid_1d_desc.GetElementSpaceSize()); p_in_global, in_grid_1d_desc.GetElementSpaceSize());
auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>( // auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_out_global, out_grid_1d_desc.GetElementSpaceSize()); // p_out_global, out_grid_1d_desc.GetElementSpaceSize());
const auto thread_global_offset = make_multi_index(thread_global_id * MPerThread); // const auto thread_global_offset = make_multi_index(thread_global_id * MPerThread);
const index_t blockSize = get_block_size(); // const index_t blockSize = get_block_size();
const index_t blockPerGrid = get_grid_size(); // const index_t blockPerGrid = get_grid_size();
const auto M = in_grid_1d_desc.GetLength(I0); // const auto M = in_grid_1d_desc.GetLength(I0);
const index_t loop_step = blockPerGrid * blockSize * MPerThread; // const index_t loop_step = blockPerGrid * blockSize * MPerThread;
const auto loop_step_index = make_multi_index(loop_step); const auto loop_step_index = make_multi_index(1, 0, 0);
#if 0 #if 0
auto in_global_load = auto in_global_load =
...@@ -198,9 +223,14 @@ struct GridwiseCopy ...@@ -198,9 +223,14 @@ struct GridwiseCopy
const auto block_work_idx = const auto block_work_idx =
block_2_tile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); block_2_tile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
// constexpr auto max_lds_align = 1;
// HACK: this force m/n_block_data_idx_on_grid into SGPR // HACK: this force m/n_block_data_idx_on_grid into SGPR
const index_t m_block_data_idx_on_grid = const index_t h_block_data_idx_on_grid =
__builtin_amdgcn_readfirstlane(block_work_idx[I0] * NPerBlock * HPerBlock); __builtin_amdgcn_readfirstlane(block_work_idx[I0] * HPerBlock);
const index_t w_block_data_idx_on_grid =
__builtin_amdgcn_readfirstlane(block_work_idx[I1] * WPerBlock);
// const index_t n_block_data_idx_on_grid = // const index_t n_block_data_idx_on_grid =
// __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock); // __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
...@@ -211,12 +241,25 @@ struct GridwiseCopy ...@@ -211,12 +241,25 @@ struct GridwiseCopy
// // B matrix in LDS memory, dst of blockwise copy // // B matrix in LDS memory, dst of blockwise copy
// constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); // constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
// LDS allocation for A and B: be careful of alignment
// constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
// a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
static_cast<InDataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
// auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
// static_cast<ABDataType*>(p_shared) + a_block_space_size_aligned,
// b_block_desc_bk0_n_bk1.GetElementSpaceSize());
using SliceLengths = Sequence<NPerBlock, HPerBlock, WPerBlock>; using SliceLengths = Sequence<NPerBlock, HPerBlock, WPerBlock>;
using ABlockTransferThreadClusterLengths_AK0_M_AK1 = Sequence<4, 64, 1>; using ABlockTransferThreadClusterLengths_AK0_M_AK1 = Sequence<4, 64, 1>;
using ABlockTransferThreadClusterArrangeOrder = Sequence<1, 0, 2>; using ABlockTransferThreadClusterArrangeOrder = Sequence<0, 1, 2>;
using ABlockTransferSrcAccessOrder = Sequence<1, 0, 2>; using ABlockTransferSrcAccessOrder = Sequence<0, 1, 2>;
using ABlockTransferDstAccessOrder = Sequence<0, 1, 2>;
constexpr index_t ABlockTransferSrcVectorDim = 2; constexpr index_t ABlockTransferSrcVectorDim = 2;
constexpr index_t ABlockTransferSrcScalarPerVector = 1; constexpr index_t ABlockTransferSrcScalarPerVector = 1;
constexpr index_t ABlockTransferDstVectorDim = 2;
constexpr index_t ABlockTransferDstScalarPerVector = 1; constexpr index_t ABlockTransferDstScalarPerVector = 1;
auto in_global_load = auto in_global_load =
...@@ -232,9 +275,9 @@ struct GridwiseCopy ...@@ -232,9 +275,9 @@ struct GridwiseCopy
decltype(in_grid_1d_desc), decltype(in_grid_1d_desc),
decltype(a_block_desc_ak0_m_ak1), decltype(a_block_desc_ak0_m_ak1),
ABlockTransferSrcAccessOrder, ABlockTransferSrcAccessOrder,
Sequence<1, 0, 2>, ABlockTransferDstAccessOrder,
ABlockTransferSrcVectorDim, ABlockTransferSrcVectorDim,
2, ABlockTransferDstVectorDim,
ABlockTransferSrcScalarPerVector, ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector, ABlockTransferDstScalarPerVector,
1, 1,
...@@ -242,55 +285,42 @@ struct GridwiseCopy ...@@ -242,55 +285,42 @@ struct GridwiseCopy
true, true,
true>( true>(
in_grid_1d_desc, in_grid_1d_desc,
make_multi_index(0, m_block_data_idx_on_grid, 0), make_multi_index(0, h_block_data_idx_on_grid, w_block_data_idx_on_grid),
elementwise_op, elementwise_op,
a_block_desc_ak0_m_ak1, a_block_desc_ak0_m_ak1,
make_multi_index(0, 0, 0), make_multi_index(0, 0, 0),
ck::tensor_operation::element_wise::PassThrough{}); ck::tensor_operation::element_wise::PassThrough{});
#endif #endif
auto out_global_store = // auto out_global_store =
ThreadwiseTensorSliceTransfer_v1r3<OutDataType, // ThreadwiseTensorSliceTransfer_v1r3<OutDataType,
OutDataType, // OutDataType,
decltype(thread_buffer_desc_m), // decltype(thread_buffer_desc_m),
decltype(out_grid_1d_desc), // decltype(out_grid_1d_desc),
PassThroughOp, // PassThroughOp,
SliceLengths, // SliceLengths // SliceLengths, // SliceLengths
Sequence<1, 0, 2>, // DimAccessOrder // Sequence<1, 0, 2>, // DimAccessOrder
0, // SrcVectorDim // 0, // SrcVectorDim
OutScalarPerVector, // OutScalarPerVector,
InMemoryDataOperationEnum::Set, // InMemoryDataOperationEnum::Set,
1, // 1,
false>( // false>(
out_grid_1d_desc, thread_global_offset, PassThroughOp{}); // out_grid_1d_desc, thread_global_offset, PassThroughOp{});
index_t num_iter = M / (loop_step); index_t num_iter = in_grid_1d_desc.GetLength(I0);
do do
{ {
in_global_load.Run(in_grid_1d_desc, // in_global_load.Run(
in_global_buf, // in_grid_1d_desc, in_global_buf, a_block_desc_ak0_m_ak1, a_block_buf, I0);
thread_buffer_desc_m,
make_tuple(I0), // in_global_load.MoveSrcSliceWindow(in_grid_1d_desc, loop_step_index);
in_thread_buf);
// out_global_store.Run(thread_buffer_desc_m,
in_global_load.MoveSrcSliceWindow(in_grid_1d_desc, loop_step_index); // make_tuple(I0),
// out_thread_buf,
static_for<0, MPerThread, 1>{}([&](auto iM) { // out_grid_1d_desc,
// get reference to in data // out_global_buf);
const auto& in_data_ref = in_thread_buf(iM); //
// out_global_store.MoveDstSliceWindow(out_grid_1d_desc, loop_step_index);
// get reference to dst data
auto& out_data_ref = out_thread_buf(iM);
elementwise_op(out_data_ref, in_data_ref);
});
out_global_store.Run(thread_buffer_desc_m,
make_tuple(I0),
out_thread_buf,
out_grid_1d_desc,
out_global_buf);
out_global_store.MoveDstSliceWindow(out_grid_1d_desc, loop_step_index);
} while(--num_iter); } while(--num_iter);
} }
}; };
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment