Commit c2e5822c authored by Po-Yen, Chen's avatar Po-Yen, Chen
Browse files

Use the normal Block2TileMap convention

parent ed794598
......@@ -111,9 +111,7 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
}
static auto MakeDescriptor_N_H_W(const std::array<index_t, NumDim>& lengths,
const std::array<index_t, NumDim>& stride,
index_t gridSize,
index_t blockSize)
const std::array<index_t, NumDim>& stride)
{
// create nd descriptor, shape: [d[0], d[1], d[2], ..., d[NumDim-3], d[NumDim-2],
// d[NumDim-1]]
......@@ -139,8 +137,8 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
desc_n_h_w, make_tuple(NPerBlock, HPerBlock, WPerBlock), Sequence<true, true, true>{});
}
using InGrid1dDesc = decltype(MakeDescriptor_N_H_W({1, 1}, {1, 1}, 1, 1));
using OutGrid1dDesc = decltype(MakeDescriptor_N_H_W({1, 1}, {1, 1}, 1, 1));
using InGrid1dDesc = decltype(MakeDescriptor_N_H_W({1, 1}, {1, 1}));
using OutGrid1dDesc = decltype(MakeDescriptor_N_H_W({1, 1}, {1, 1}));
using GridwiseCopy = GridwiseCopy<InGrid1dDesc,
OutGrid1dDesc,
......@@ -164,23 +162,19 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
const void* in_dev_buffer,
void* out_dev_buffer,
ElementwiseOperation elementwise_op)
: blockSize_(256),
gridSize_(120), // FIXME - Calculate the grid size by number of CU in the future
in_dev_buffer_(static_cast<InDataTypePointer>(in_dev_buffer)),
: in_dev_buffer_(static_cast<InDataTypePointer>(in_dev_buffer)),
out_dev_buffer_(static_cast<OutDataTypePointer>(out_dev_buffer)),
in_grid_1d_desc_(MakeDescriptor_N_H_W(inLengths, inStrides, gridSize_, blockSize_)),
out_grid_1d_desc_(MakeDescriptor_N_H_W(inLengths, inStrides, gridSize_, blockSize_)),
in_grid_1d_desc_(MakeDescriptor_N_H_W(inLengths, inStrides)),
out_grid_1d_desc_(MakeDescriptor_N_H_W(inLengths, inStrides)),
inLengths_(inLengths),
inStrides_(inStrides),
outLengths_(outLengths),
outStrides_(outStrides),
elementwise_op_(elementwise_op)
elementwise_op_(elementwise_op),
block_2_tile_map_(GridwiseCopy::MakeDefaultBlock2TileMap(in_grid_1d_desc_))
{
}
index_t blockSize_;
index_t gridSize_;
InDataTypePointer in_dev_buffer_;
OutDataTypePointer out_dev_buffer_;
InGrid1dDesc in_grid_1d_desc_;
......@@ -192,29 +186,35 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
std::array<index_t, NumDim> outStrides_;
ElementwiseOperation elementwise_op_;
typename GridwiseCopy::DefaultBlock2TileMap block_2_tile_map_;
};
struct Invoker : detail::InvokerBase<Invoker, Argument>
{
static float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
const index_t grid_size = arg.block_2_tile_map_.CalculateGridSize(arg.in_grid_1d_desc_);
const auto kernel = kernel_nd_copy<GridwiseCopy,
InGrid1dDesc,
OutGrid1dDesc,
InDataTypePointer,
OutDataTypePointer,
ElementwiseOperation>;
ElementwiseOperation,
typename GridwiseCopy::DefaultBlock2TileMap>;
float elapsed_time = launch_and_time_kernel(stream_config,
kernel,
dim3(arg.gridSize_),
dim3(arg.blockSize_),
dim3(grid_size),
dim3(BlockSize),
0,
arg.in_grid_1d_desc_,
arg.out_grid_1d_desc_,
arg.in_dev_buffer_,
arg.out_dev_buffer_,
arg.elementwise_op_);
arg.elementwise_op_,
arg.block_2_tile_map_);
return elapsed_time;
}
};
......
......@@ -23,12 +23,28 @@ struct BlockToTileMap
static constexpr index_t NumDim = TileDims::Size();
static_assert(NumDim == GridDescriptor::GetNumOfDimension());
BlockToTileMap() = delete;
~BlockToTileMap() = delete;
BlockToTileMap() = default;
BlockToTileMap(const GridDescriptor& desc) : desc_(desc) {}
__host__ constexpr index_t CalculateGridSize(const GridDescriptor& desc) const
{
return [&]() {
std::array<index_t, NumDim> num_tiles_per_axis;
static_for<0, NumDim, 1>{}([&](auto I) {
num_tiles_per_axis[I] =
math::integer_divide_ceil(desc.GetLength(I), TileDims::At(I));
});
return std::accumulate(begin(num_tiles_per_axis),
end(num_tiles_per_axis),
index_t{1},
std::multiplies<index_t>{});
}();
}
template <typename TopIdx>
__host__ __device__ static constexpr auto CalculateBottomIndex(const GridDescriptor& desc,
const TopIdx& idx_top)
__host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const
{
static_assert(TopIdx::Size() == 1);
......@@ -36,7 +52,7 @@ struct BlockToTileMap
std::array<index_t, NumDim> num_tiles_per_axis;
static_for<0, NumDim, 1>{}([&](auto I) {
num_tiles_per_axis[I] = math::integer_divide_ceil(desc.GetLength(I), TileDims::At(I));
num_tiles_per_axis[I] = math::integer_divide_ceil(desc_.GetLength(I), TileDims::At(I));
});
std::array<index_t, NumDim> divisors;
......@@ -54,6 +70,9 @@ struct BlockToTileMap
},
Number<NumDim>{});
}
private:
const GridDescriptor desc_;
};
} // namespace detail
......@@ -62,15 +81,21 @@ template <typename GridwiseCopyFunctor,
typename OutGrid1dDesc,
typename InDataTypePointer,
typename OutDataTypePointer,
typename ElementwiseOperation>
typename ElementwiseOperation,
typename Block2TileMap>
__global__ void kernel_nd_copy(const InGrid1dDesc in_grid_1d_desc,
const OutGrid1dDesc out_grid_1d_desc,
const InDataTypePointer p_in_global,
const OutDataTypePointer p_out_global,
const ElementwiseOperation elementwise_op)
const ElementwiseOperation elementwise_op,
const Block2TileMap block_2_tile_map)
{
GridwiseCopyFunctor::Run(
in_grid_1d_desc, out_grid_1d_desc, p_in_global, p_out_global, elementwise_op);
GridwiseCopyFunctor::Run(in_grid_1d_desc,
out_grid_1d_desc,
p_in_global,
p_out_global,
elementwise_op,
block_2_tile_map);
}
template <typename InGrid1dDesc,
......@@ -100,10 +125,10 @@ struct GridwiseCopy
using ThisThreadBlock = ThisThreadBlock<BlockSize>;
using BlockToTileMap =
using DefaultBlock2TileMap =
detail::BlockToTileMap<Sequence<NPerBlock, HPerBlock, WPerBlock>, InGrid1dDesc>;
__host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
__host__ __device__ static constexpr auto GetInBlockDescriptor()
{
constexpr index_t ABlockLdsExtraM = 0;
......@@ -115,11 +140,18 @@ struct GridwiseCopy
I1));
}
__host__ __device__ static constexpr auto MakeDefaultBlock2TileMap(const InGrid1dDesc& desc)
{
return DefaultBlock2TileMap{desc};
}
template <typename Block2TileMap>
__device__ static void Run(const InGrid1dDesc in_grid_1d_desc,
const OutGrid1dDesc out_grid_1d_desc,
const InDataTypePointer p_in_global,
const OutDataTypePointer p_out_global,
const ElementwiseOperation elementwise_op)
const ElementwiseOperation elementwise_op,
const Block2TileMap& block_2_tile_map)
{
const index_t thread_global_id = get_thread_global_1d_id();
......@@ -156,8 +188,8 @@ struct GridwiseCopy
1, // SrcScalarStrideInVector
false>{in_grid_1d_desc, thread_global_offset};
#else
const auto block_work_idx = BlockToTileMap::CalculateBottomIndex(
in_grid_1d_desc, make_multi_index(get_block_1d_id()));
const auto block_work_idx =
block_2_tile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
// HACK: this force m/n_block_data_idx_on_grid into SGPR
const index_t m_block_data_idx_on_grid =
......@@ -167,7 +199,7 @@ struct GridwiseCopy
// __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
// A matrix in LDS memory, dst of blockwise copy
constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
constexpr auto a_block_desc_ak0_m_ak1 = GetInBlockDescriptor();
// // B matrix in LDS memory, dst of blockwise copy
// constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment