Commit b3782d46 authored by Po-Yen, Chen

Add 'BlockSize' parameter to 'DevicePermute'

parent 6ab0e31f
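
For downstream code, the visible effect of this commit is one extra template argument: BlockSize (the number of threads per workgroup) now sits between NumDim and NPerBlock. A hedged before/after sketch, with the argument roles taken from the comment table in the example diff below (the alias names OldInstance/NewInstance are illustrative, not part of the commit):

    // Before this commit: 10 template arguments, no explicit block size.
    using OldInstance = ck::tensor_operation::device::DevicePermute<
        ADataType, BDataType, PassThrough, /*NumDim*/ 4,
        /*NPerBlock*/ 128, /*HPerBlock*/ 128, /*WPerBlock*/ 128,
        /*MPerThread*/ 8, /*InScalarPerVector*/ 8, /*OutScalarPerVector*/ 1>;

    // After this commit: BlockSize is inserted right after NumDim.
    using NewInstance = ck::tensor_operation::device::DevicePermute<
        ADataType, BDataType, PassThrough, /*NumDim*/ 4, /*BlockSize*/ 256,
        /*NPerBlock*/ 128, /*HPerBlock*/ 128, /*WPerBlock*/ 128,
        /*MPerThread*/ 8, /*InScalarPerVector*/ 8, /*OutScalarPerVector*/ 1>;
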
......@@ -8,11 +8,11 @@ using BDataType = F16;
// clang-format off
using DevicePermuteInstance = ck::tensor_operation::device::DevicePermute
-// ######| InData| OutData| Elementwise| NumDim| NPer| HPer| WPer|MPerThread| InScalar| OutScalar|
-// ######| Type| Type| Operation| | Block| Block| Block| | PerVector| PerVector|
-// ######| | | | | | | | | | |
-// ######| | | | | | | | | | |
-        < ADataType, BDataType, PassThrough, 4, 128, 128, 128, 8, 8, 1>;
+// ######| InData| OutData| Elementwise| NumDim| Block| NPer| HPer| WPer|MPerThread| InScalar| OutScalar|
+// ######| Type| Type| Operation| | Size| Block| Block| Block| | PerVector| PerVector|
+// ######| | | | | | | | | | | |
+// ######| | | | | | | | | | | |
+        < ADataType, BDataType, PassThrough, 4, 256, 128, 128, 128, 8, 8, 1>;
// clang-format on
#include "run_permute_example.inc"
......
......@@ -10,6 +10,7 @@
#include "ck/utility/math.hpp"
#include "ck/utility/sequence.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_copy.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
......@@ -77,6 +78,7 @@ template <typename InDataType,
typename OutDataType,
typename ElementwiseOperation,
index_t NumDim,
index_t BlockSize,
index_t NPerBlock,
index_t HPerBlock,
index_t WPerBlock,
......@@ -87,6 +89,7 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
OutDataType,
ElementwiseOperation,
NumDim,
BlockSize,
NPerBlock,
HPerBlock,
WPerBlock,
......@@ -148,19 +151,8 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
Sequence<NumDim - 1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
-#if 0
-        const index_t N = std::accumulate(begin(lengths), std::prev(end(lengths), 2), index_t{1}, std::multiplies<index_t>{});
-        const auto desc_m = transform_tensor_descriptor(
-            desc_n_h_w,
-            make_tuple(make_merge_transform(make_tuple(N, H, W))),
-            make_tuple(Sequence<0, 1, 2>{}),
-            make_tuple(Sequence<0>{})
-        );
-        return PadDescriptor_M_1d(desc_m, gridSize, blockSize);
-#else
-        return PadDescriptor_M_1d(desc_n_h_w, gridSize, blockSize);
-#endif
+        return PadTensorDescriptor(
+            desc_n_h_w, make_tuple(NPerBlock, HPerBlock, WPerBlock), Sequence<true, true, true>{});
}
using InGrid1dDesc = decltype(MakeDescriptor_N_H_W({1, 1}, {1, 1}, 1, 1));
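
The hunk above also changes how the descriptor is padded: instead of merging N/H/W and calling PadDescriptor_M_1d on the flattened length, each of the three lengths is now padded to a multiple of its *PerBlock tile via PadTensorDescriptor. A minimal standalone sketch of that rounding rule (plain C++, not the CK implementation; the helper name pad_to_tile is hypothetical):

    #include <cstdint>

    // Round a length up to the next multiple of its per-block tile size, which is
    // the effect requested by PadTensorDescriptor(desc_n_h_w,
    // make_tuple(NPerBlock, HPerBlock, WPerBlock), Sequence<true, true, true>{}).
    constexpr std::int64_t pad_to_tile(std::int64_t length, std::int64_t tile)
    {
        return ((length + tile - 1) / tile) * tile;
    }

    static_assert(pad_to_tile(130, 128) == 256, "130 is padded up to two 128-wide tiles");
    static_assert(pad_to_tile(128, 128) == 128, "tile-aligned lengths are left unchanged");
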
......@@ -171,6 +163,10 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
InDataTypePointer,
OutDataTypePointer,
ElementwiseOperation,
BlockSize,
NPerBlock,
HPerBlock,
WPerBlock,
MPerThread,
InScalarPerVector,
OutScalarPerVector>;
......
......@@ -5,6 +5,7 @@
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
......@@ -31,18 +32,40 @@ template <typename InGrid1dDesc,
typename InDataTypePointer,
typename OutDataTypePointer,
typename ElementwiseOperation,
index_t BlockSize,
index_t NPerBlock,
index_t HPerBlock,
index_t WPerBlock,
index_t MPerThread,
index_t InScalarPerVector,
index_t OutScalarPerVector>
struct GridwiseCopy
{
static_assert(InGrid1dDesc::GetNumOfDimension() == 3 &&
OutGrid1dDesc::GetNumOfDimension() == 3);
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto thread_buffer_desc_m =
make_naive_tensor_descriptor_packed(make_tuple(Number<MPerThread>{}));
using PassThroughOp = tensor_operation::element_wise::PassThrough;
using ThisThreadBlock = ThisThreadBlock<BlockSize>;
__host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
{
constexpr index_t ABlockLdsExtraM = 0;
// A matrix in LDS memory, dst of blockwise copy
return make_naive_tensor_descriptor(
make_tuple(Number<NPerBlock>{}, Number<HPerBlock>{}, Number<WPerBlock>{}),
make_tuple(Number<NPerBlock + ABlockLdsExtraM>{} * Number<HPerBlock>{},
Number<HPerBlock>{},
I1));
}
__device__ static void Run(const InGrid1dDesc in_grid_1d_desc,
const OutGrid1dDesc out_grid_1d_desc,
const InDataTypePointer p_in_global,
......@@ -71,6 +94,7 @@ struct GridwiseCopy
const index_t loop_step = blockPerGrid * blockSize * MPerThread;
const auto loop_step_index = make_multi_index(loop_step);
#if 1
auto in_global_load =
ThreadwiseTensorSliceTransfer_v2<InDataType,
InDataType,
......@@ -82,7 +106,60 @@ struct GridwiseCopy
InScalarPerVector, // ScalarPerVector
1, // SrcScalarStrideInVector
false>{in_grid_1d_desc, thread_global_offset};
#else
// const auto block_work_idx =
// block_2_etile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
// HACK: this force m/n_block_data_idx_on_grid into SGPR
const index_t m_block_data_idx_on_grid =
__builtin_amdgcn_readfirstlane(block_work_idx[I0] * NPerBlock * HPerBlock);
// const index_t n_block_data_idx_on_grid =
// __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
// A matrix in LDS memory, dst of blockwise copy
constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
// // B matrix in LDS memory, dst of blockwise copy
// constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
using SliceLengths = Sequence<NPerBlock, HPerBlock, WPerBlock>;
using ABlockTransferThreadClusterLengths_AK0_M_AK1 = Sequence<4, 64, 1>;
using ABlockTransferThreadClusterArrangeOrder = Sequence<1, 0, 2>;
using ABlockTransferSrcAccessOrder = int;
constexpr index_t ABlockTransferSrcVectorDim = 2;
constexpr index_t ABlockTransferSrcScalarPerVector = 1;
constexpr index_t ABlockTransferDstScalarPerVector = 1;
auto in_global_load =
ThreadGroupTensorSliceTransfer_v4r1<ThisThreadBlock,
ElementwiseOperation,
ck::tensor_operation::element_wise::PassThrough,
InMemoryDataOperationEnum::Set,
SliceLengths,
ABlockTransferThreadClusterLengths_AK0_M_AK1,
ABlockTransferThreadClusterArrangeOrder,
InDataType,
InDataType,
decltype(in_grid_1d_desc),
decltype(a_block_desc_ak0_m_ak1),
ABlockTransferSrcAccessOrder,
Sequence<1, 0, 2>,
ABlockTransferSrcVectorDim,
2,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector,
1,
1,
true,
true>(
in_grid_1d_desc,
make_multi_index(0, m_block_data_idx_on_grid, 0),
element_op,
a_block_desc_ak0_m_ak1,
make_multi_index(0, 0, 0),
ck::tensor_operation::element_wise::PassThrough{});
#endif
auto out_global_store =
ThreadwiseTensorSliceTransfer_v1r3<OutDataType,
OutDataType,
......
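
For orientation, the Run body that these transfer objects plug into (only partially visible in the hunks above) walks the tensor with a grid-stride loop: each thread owns MPerThread contiguous elements and the whole grid advances by loop_step = blockPerGrid * blockSize * MPerThread per pass. A hedged scalar model of that traversal (plain C++, ignoring vectorization, padding and the element-wise operation plumbing; all names here are illustrative):

    #include <cstdint>

    // Scalar analogue of the grid-stride copy performed by GridwiseCopy::Run.
    void copy_grid_stride(const float* p_in, float* p_out, std::int64_t total,
                          std::int64_t block_per_grid, std::int64_t block_size,
                          std::int64_t m_per_thread, std::int64_t global_thread_id)
    {
        const std::int64_t loop_step = block_per_grid * block_size * m_per_thread;
        for (std::int64_t base = global_thread_id * m_per_thread; base < total; base += loop_step)
        {
            for (std::int64_t i = 0; i < m_per_thread && base + i < total; ++i)
            {
                p_out[base + i] = p_in[base + i]; // PassThrough: out = in
            }
        }
    }
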