Commit c2e5822c authored by Po-Yen, Chen's avatar Po-Yen, Chen
Browse files

Use the normal Block2TileMap convention

parent ed794598
...@@ -111,9 +111,7 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType, ...@@ -111,9 +111,7 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
} }
static auto MakeDescriptor_N_H_W(const std::array<index_t, NumDim>& lengths, static auto MakeDescriptor_N_H_W(const std::array<index_t, NumDim>& lengths,
const std::array<index_t, NumDim>& stride, const std::array<index_t, NumDim>& stride)
index_t gridSize,
index_t blockSize)
{ {
// create nd descriptor, shape: [d[0], d[1], d[2], ..., d[NumDim-3], d[NumDim-2], // create nd descriptor, shape: [d[0], d[1], d[2], ..., d[NumDim-3], d[NumDim-2],
// d[NumDim-1]] // d[NumDim-1]]
...@@ -139,8 +137,8 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType, ...@@ -139,8 +137,8 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
desc_n_h_w, make_tuple(NPerBlock, HPerBlock, WPerBlock), Sequence<true, true, true>{}); desc_n_h_w, make_tuple(NPerBlock, HPerBlock, WPerBlock), Sequence<true, true, true>{});
} }
using InGrid1dDesc = decltype(MakeDescriptor_N_H_W({1, 1}, {1, 1}, 1, 1)); using InGrid1dDesc = decltype(MakeDescriptor_N_H_W({1, 1}, {1, 1}));
using OutGrid1dDesc = decltype(MakeDescriptor_N_H_W({1, 1}, {1, 1}, 1, 1)); using OutGrid1dDesc = decltype(MakeDescriptor_N_H_W({1, 1}, {1, 1}));
using GridwiseCopy = GridwiseCopy<InGrid1dDesc, using GridwiseCopy = GridwiseCopy<InGrid1dDesc,
OutGrid1dDesc, OutGrid1dDesc,
...@@ -164,23 +162,19 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType, ...@@ -164,23 +162,19 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
const void* in_dev_buffer, const void* in_dev_buffer,
void* out_dev_buffer, void* out_dev_buffer,
ElementwiseOperation elementwise_op) ElementwiseOperation elementwise_op)
: blockSize_(256), : in_dev_buffer_(static_cast<InDataTypePointer>(in_dev_buffer)),
gridSize_(120), // FIXME - Calculate the grid size by number of CU in the future
in_dev_buffer_(static_cast<InDataTypePointer>(in_dev_buffer)),
out_dev_buffer_(static_cast<OutDataTypePointer>(out_dev_buffer)), out_dev_buffer_(static_cast<OutDataTypePointer>(out_dev_buffer)),
in_grid_1d_desc_(MakeDescriptor_N_H_W(inLengths, inStrides, gridSize_, blockSize_)), in_grid_1d_desc_(MakeDescriptor_N_H_W(inLengths, inStrides)),
out_grid_1d_desc_(MakeDescriptor_N_H_W(inLengths, inStrides, gridSize_, blockSize_)), out_grid_1d_desc_(MakeDescriptor_N_H_W(inLengths, inStrides)),
inLengths_(inLengths), inLengths_(inLengths),
inStrides_(inStrides), inStrides_(inStrides),
outLengths_(outLengths), outLengths_(outLengths),
outStrides_(outStrides), outStrides_(outStrides),
elementwise_op_(elementwise_op) elementwise_op_(elementwise_op),
block_2_tile_map_(GridwiseCopy::MakeDefaultBlock2TileMap(in_grid_1d_desc_))
{ {
} }
index_t blockSize_;
index_t gridSize_;
InDataTypePointer in_dev_buffer_; InDataTypePointer in_dev_buffer_;
OutDataTypePointer out_dev_buffer_; OutDataTypePointer out_dev_buffer_;
InGrid1dDesc in_grid_1d_desc_; InGrid1dDesc in_grid_1d_desc_;
...@@ -192,29 +186,35 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType, ...@@ -192,29 +186,35 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
std::array<index_t, NumDim> outStrides_; std::array<index_t, NumDim> outStrides_;
ElementwiseOperation elementwise_op_; ElementwiseOperation elementwise_op_;
typename GridwiseCopy::DefaultBlock2TileMap block_2_tile_map_;
}; };
struct Invoker : detail::InvokerBase<Invoker, Argument> struct Invoker : detail::InvokerBase<Invoker, Argument>
{ {
static float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) static float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{ {
const index_t grid_size = arg.block_2_tile_map_.CalculateGridSize(arg.in_grid_1d_desc_);
const auto kernel = kernel_nd_copy<GridwiseCopy, const auto kernel = kernel_nd_copy<GridwiseCopy,
InGrid1dDesc, InGrid1dDesc,
OutGrid1dDesc, OutGrid1dDesc,
InDataTypePointer, InDataTypePointer,
OutDataTypePointer, OutDataTypePointer,
ElementwiseOperation>; ElementwiseOperation,
typename GridwiseCopy::DefaultBlock2TileMap>;
float elapsed_time = launch_and_time_kernel(stream_config, float elapsed_time = launch_and_time_kernel(stream_config,
kernel, kernel,
dim3(arg.gridSize_), dim3(grid_size),
dim3(arg.blockSize_), dim3(BlockSize),
0, 0,
arg.in_grid_1d_desc_, arg.in_grid_1d_desc_,
arg.out_grid_1d_desc_, arg.out_grid_1d_desc_,
arg.in_dev_buffer_, arg.in_dev_buffer_,
arg.out_dev_buffer_, arg.out_dev_buffer_,
arg.elementwise_op_); arg.elementwise_op_,
arg.block_2_tile_map_);
return elapsed_time; return elapsed_time;
} }
}; };
......
...@@ -23,12 +23,28 @@ struct BlockToTileMap ...@@ -23,12 +23,28 @@ struct BlockToTileMap
static constexpr index_t NumDim = TileDims::Size(); static constexpr index_t NumDim = TileDims::Size();
static_assert(NumDim == GridDescriptor::GetNumOfDimension()); static_assert(NumDim == GridDescriptor::GetNumOfDimension());
BlockToTileMap() = delete; BlockToTileMap() = default;
~BlockToTileMap() = delete;
BlockToTileMap(const GridDescriptor& desc) : desc_(desc) {}
__host__ constexpr index_t CalculateGridSize(const GridDescriptor& desc) const
{
return [&]() {
std::array<index_t, NumDim> num_tiles_per_axis;
static_for<0, NumDim, 1>{}([&](auto I) {
num_tiles_per_axis[I] =
math::integer_divide_ceil(desc.GetLength(I), TileDims::At(I));
});
return std::accumulate(begin(num_tiles_per_axis),
end(num_tiles_per_axis),
index_t{1},
std::multiplies<index_t>{});
}();
}
template <typename TopIdx> template <typename TopIdx>
__host__ __device__ static constexpr auto CalculateBottomIndex(const GridDescriptor& desc, __host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const
const TopIdx& idx_top)
{ {
static_assert(TopIdx::Size() == 1); static_assert(TopIdx::Size() == 1);
...@@ -36,7 +52,7 @@ struct BlockToTileMap ...@@ -36,7 +52,7 @@ struct BlockToTileMap
std::array<index_t, NumDim> num_tiles_per_axis; std::array<index_t, NumDim> num_tiles_per_axis;
static_for<0, NumDim, 1>{}([&](auto I) { static_for<0, NumDim, 1>{}([&](auto I) {
num_tiles_per_axis[I] = math::integer_divide_ceil(desc.GetLength(I), TileDims::At(I)); num_tiles_per_axis[I] = math::integer_divide_ceil(desc_.GetLength(I), TileDims::At(I));
}); });
std::array<index_t, NumDim> divisors; std::array<index_t, NumDim> divisors;
...@@ -54,6 +70,9 @@ struct BlockToTileMap ...@@ -54,6 +70,9 @@ struct BlockToTileMap
}, },
Number<NumDim>{}); Number<NumDim>{});
} }
private:
const GridDescriptor desc_;
}; };
} // namespace detail } // namespace detail
...@@ -62,15 +81,21 @@ template <typename GridwiseCopyFunctor, ...@@ -62,15 +81,21 @@ template <typename GridwiseCopyFunctor,
typename OutGrid1dDesc, typename OutGrid1dDesc,
typename InDataTypePointer, typename InDataTypePointer,
typename OutDataTypePointer, typename OutDataTypePointer,
typename ElementwiseOperation> typename ElementwiseOperation,
typename Block2TileMap>
__global__ void kernel_nd_copy(const InGrid1dDesc in_grid_1d_desc, __global__ void kernel_nd_copy(const InGrid1dDesc in_grid_1d_desc,
const OutGrid1dDesc out_grid_1d_desc, const OutGrid1dDesc out_grid_1d_desc,
const InDataTypePointer p_in_global, const InDataTypePointer p_in_global,
const OutDataTypePointer p_out_global, const OutDataTypePointer p_out_global,
const ElementwiseOperation elementwise_op) const ElementwiseOperation elementwise_op,
const Block2TileMap block_2_tile_map)
{ {
GridwiseCopyFunctor::Run( GridwiseCopyFunctor::Run(in_grid_1d_desc,
in_grid_1d_desc, out_grid_1d_desc, p_in_global, p_out_global, elementwise_op); out_grid_1d_desc,
p_in_global,
p_out_global,
elementwise_op,
block_2_tile_map);
} }
template <typename InGrid1dDesc, template <typename InGrid1dDesc,
...@@ -100,10 +125,10 @@ struct GridwiseCopy ...@@ -100,10 +125,10 @@ struct GridwiseCopy
using ThisThreadBlock = ThisThreadBlock<BlockSize>; using ThisThreadBlock = ThisThreadBlock<BlockSize>;
using BlockToTileMap = using DefaultBlock2TileMap =
detail::BlockToTileMap<Sequence<NPerBlock, HPerBlock, WPerBlock>, InGrid1dDesc>; detail::BlockToTileMap<Sequence<NPerBlock, HPerBlock, WPerBlock>, InGrid1dDesc>;
__host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1() __host__ __device__ static constexpr auto GetInBlockDescriptor()
{ {
constexpr index_t ABlockLdsExtraM = 0; constexpr index_t ABlockLdsExtraM = 0;
...@@ -115,11 +140,18 @@ struct GridwiseCopy ...@@ -115,11 +140,18 @@ struct GridwiseCopy
I1)); I1));
} }
__host__ __device__ static constexpr auto MakeDefaultBlock2TileMap(const InGrid1dDesc& desc)
{
return DefaultBlock2TileMap{desc};
}
template <typename Block2TileMap>
__device__ static void Run(const InGrid1dDesc in_grid_1d_desc, __device__ static void Run(const InGrid1dDesc in_grid_1d_desc,
const OutGrid1dDesc out_grid_1d_desc, const OutGrid1dDesc out_grid_1d_desc,
const InDataTypePointer p_in_global, const InDataTypePointer p_in_global,
const OutDataTypePointer p_out_global, const OutDataTypePointer p_out_global,
const ElementwiseOperation elementwise_op) const ElementwiseOperation elementwise_op,
const Block2TileMap& block_2_tile_map)
{ {
const index_t thread_global_id = get_thread_global_1d_id(); const index_t thread_global_id = get_thread_global_1d_id();
...@@ -156,8 +188,8 @@ struct GridwiseCopy ...@@ -156,8 +188,8 @@ struct GridwiseCopy
1, // SrcScalarStrideInVector 1, // SrcScalarStrideInVector
false>{in_grid_1d_desc, thread_global_offset}; false>{in_grid_1d_desc, thread_global_offset};
#else #else
const auto block_work_idx = BlockToTileMap::CalculateBottomIndex( const auto block_work_idx =
in_grid_1d_desc, make_multi_index(get_block_1d_id())); block_2_tile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
// HACK: this force m/n_block_data_idx_on_grid into SGPR // HACK: this force m/n_block_data_idx_on_grid into SGPR
const index_t m_block_data_idx_on_grid = const index_t m_block_data_idx_on_grid =
...@@ -167,7 +199,7 @@ struct GridwiseCopy ...@@ -167,7 +199,7 @@ struct GridwiseCopy
// __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock); // __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
// A matrix in LDS memory, dst of blockwise copy // A matrix in LDS memory, dst of blockwise copy
constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); constexpr auto a_block_desc_ak0_m_ak1 = GetInBlockDescriptor();
// // B matrix in LDS memory, dst of blockwise copy // // B matrix in LDS memory, dst of blockwise copy
// constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); // constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment