"...composable_kernel_rocm.git" did not exist on "2eeeb1766b3274699e7cee8f87e535b2f93e4293"
Commit b3782d46 authored by Po-Yen, Chen's avatar Po-Yen, Chen
Browse files

Add 'BlockSize' parameter to 'DevicePermute'

parent 6ab0e31f
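
For orientation, this is what the example instance looks like once the new parameter is in place. A minimal standalone sketch assembled from the first hunk below; the header path and the F16/PassThrough aliases are assumptions based on the surrounding code, not an additional file in this commit:

// Minimal sketch, assuming this header path and that F16 aliases ck::half_t;
// see the first hunk below for the actual example file.
#include "ck/tensor_operation/gpu/device/device_permute.hpp"

using ADataType   = ck::half_t; // F16
using BDataType   = ck::half_t; // F16
using PassThrough = ck::tensor_operation::element_wise::PassThrough;

using DevicePermuteInstance = ck::tensor_operation::device::DevicePermute<
    ADataType,   // InDataType
    BDataType,   // OutDataType
    PassThrough, // ElementwiseOperation
    4,           // NumDim
    256,         // BlockSize (new in this commit)
    128,         // NPerBlock
    128,         // HPerBlock
    128,         // WPerBlock
    8,           // MPerThread
    8,           // InScalarPerVector
    1>;          // OutScalarPerVector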
@@ -8,11 +8,11 @@ using BDataType = F16;
 // clang-format off
 using DevicePermuteInstance = ck::tensor_operation::device::DevicePermute
-// ######| InData| OutData| Elementwise| NumDim|  NPer|  HPer|  WPer| MPerThread|  InScalar| OutScalar|
-// ######|   Type|    Type|   Operation|       | Block| Block| Block|           | PerVector| PerVector|
-// ######|       |        |            |       |      |      |      |           |          |          |
-// ######|       |        |            |       |      |      |      |           |          |          |
-        < ADataType, BDataType, PassThrough, 4, 128, 128, 128, 8, 8, 1>;
+// ######| InData| OutData| Elementwise| NumDim| Block|  NPer|  HPer|  WPer| MPerThread|  InScalar| OutScalar|
+// ######|   Type|    Type|   Operation|       |  Size| Block| Block| Block|           | PerVector| PerVector|
+// ######|       |        |            |       |      |      |      |      |           |          |          |
+// ######|       |        |            |       |      |      |      |      |           |          |          |
+        < ADataType, BDataType, PassThrough, 4, 256, 128, 128, 128, 8, 8, 1>;
 // clang-format on
 #include "run_permute_example.inc"
@@ -10,6 +10,7 @@
 #include "ck/utility/math.hpp"
 #include "ck/utility/sequence.hpp"
 #include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_copy.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
@@ -77,6 +78,7 @@ template <typename InDataType,
           typename OutDataType,
           typename ElementwiseOperation,
           index_t NumDim,
+          index_t BlockSize,
           index_t NPerBlock,
           index_t HPerBlock,
           index_t WPerBlock,
@@ -87,6 +89,7 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
                                                                OutDataType,
                                                                ElementwiseOperation,
                                                                NumDim,
+                                                               BlockSize,
                                                                NPerBlock,
                                                                HPerBlock,
                                                                WPerBlock,
@@ -148,19 +151,8 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
                              Sequence<NumDim - 1>{}),
             make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));

-#if 0
-        const index_t N = std::accumulate(begin(lengths), std::prev(end(lengths), 2), index_t{1}, std::multiplies<index_t>{});
-        const auto desc_m = transform_tensor_descriptor(
-            desc_n_h_w,
-            make_tuple(make_merge_transform(make_tuple(N, H, W))),
-            make_tuple(Sequence<0, 1, 2>{}),
-            make_tuple(Sequence<0>{})
-        );
-        return PadDescriptor_M_1d(desc_m, gridSize, blockSize);
-#else
-        return PadDescriptor_M_1d(desc_n_h_w, gridSize, blockSize);
-#endif
+        return PadTensorDescriptor(
+            desc_n_h_w, make_tuple(NPerBlock, HPerBlock, WPerBlock), Sequence<true, true, true>{});
     }

     using InGrid1dDesc = decltype(MakeDescriptor_N_H_W({1, 1}, {1, 1}, 1, 1));
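The removed branch merged the descriptor to 1-D and padded by grid and block size; the new code instead pads each of N, H, W up to its per-block tile via PadTensorDescriptor from the newly included matrix_padder.hpp. A self-contained sketch of the rounding rule that the Sequence<true, true, true> flags request, using the 128-wide tiles from the example instance (the int32_t alias mirrors ck::index_t and is an assumption here):

#include <cstdint>

using index_t = int32_t;

// Round a length up to the next multiple of its tile; this is the effect
// PadTensorDescriptor is expected to have on every flagged dimension.
constexpr index_t RoundUpToTile(index_t len, index_t tile)
{
    return (len + tile - 1) / tile * tile;
}

static_assert(RoundUpToTile(200, 128) == 256, "H = 200 pads up to 256");
static_assert(RoundUpToTile(128, 128) == 128, "aligned lengths are unchanged");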
@@ -171,6 +163,10 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
                                       InDataTypePointer,
                                       OutDataTypePointer,
                                       ElementwiseOperation,
+                                      BlockSize,
+                                      NPerBlock,
+                                      HPerBlock,
+                                      WPerBlock,
                                       MPerThread,
                                       InScalarPerVector,
                                       OutScalarPerVector>;
@@ -5,6 +5,7 @@
 #include "ck/tensor_description/cluster_descriptor.hpp"
 #include "ck/utility/data_type.hpp"
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
@@ -31,18 +32,40 @@ template <typename InGrid1dDesc,
           typename InDataTypePointer,
           typename OutDataTypePointer,
           typename ElementwiseOperation,
+          index_t BlockSize,
+          index_t NPerBlock,
+          index_t HPerBlock,
+          index_t WPerBlock,
           index_t MPerThread,
           index_t InScalarPerVector,
           index_t OutScalarPerVector>
 struct GridwiseCopy
 {
+    static_assert(InGrid1dDesc::GetNumOfDimension() == 3 &&
+                  OutGrid1dDesc::GetNumOfDimension() == 3);
+
     static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};

     static constexpr auto thread_buffer_desc_m =
         make_naive_tensor_descriptor_packed(make_tuple(Number<MPerThread>{}));

     using PassThroughOp = tensor_operation::element_wise::PassThrough;

+    using ThisThreadBlock = ThisThreadBlock<BlockSize>;
+
+    __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
+    {
+        constexpr index_t ABlockLdsExtraM = 0;
+
+        // A matrix in LDS memory, dst of blockwise copy
+        return make_naive_tensor_descriptor(
+            make_tuple(Number<NPerBlock>{}, Number<HPerBlock>{}, Number<WPerBlock>{}),
+            make_tuple(Number<NPerBlock + ABlockLdsExtraM>{} * Number<HPerBlock>{},
+                       Number<HPerBlock>{},
+                       I1));
+    }
+
     __device__ static void Run(const InGrid1dDesc in_grid_1d_desc,
                                const OutGrid1dDesc out_grid_1d_desc,
                                const InDataTypePointer p_in_global,
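With the tile sizes from the example instance (NPerBlock = HPerBlock = WPerBlock = 128) and ABlockLdsExtraM = 0, the LDS descriptor built above has lengths (128, 128, 128) and strides (128 * 128, 128, 1); the compile-time checks below just spell out that arithmetic:

constexpr int NPerBlock = 128, HPerBlock = 128, WPerBlock = 128;
constexpr int ABlockLdsExtraM = 0;

// Outermost stride as written in GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1().
static_assert((NPerBlock + ABlockLdsExtraM) * HPerBlock == 16384);

// With equal tiles and no LDS padding this coincides with a packed (N, H, W)
// layout, whose outer stride would be HPerBlock * WPerBlock.
static_assert(HPerBlock * WPerBlock == 16384);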
@@ -71,6 +94,7 @@ struct GridwiseCopy
         const index_t loop_step = blockPerGrid * blockSize * MPerThread;
         const auto loop_step_index = make_multi_index(loop_step);

+#if 1
         auto in_global_load =
             ThreadwiseTensorSliceTransfer_v2<InDataType,
                                              InDataType,
@@ -82,7 +106,60 @@ struct GridwiseCopy
                                              InScalarPerVector, // ScalarPerVector
                                              1,                 // SrcScalarStrideInVector
                                              false>{in_grid_1d_desc, thread_global_offset};
+#else
+        // const auto block_work_idx =
+        //     block_2_etile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
+
+        // HACK: this force m/n_block_data_idx_on_grid into SGPR
+        const index_t m_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_work_idx[I0] * NPerBlock * HPerBlock);
+
+        // const index_t n_block_data_idx_on_grid =
+        //     __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
+
+        // A matrix in LDS memory, dst of blockwise copy
+        constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
+
+        // // B matrix in LDS memory, dst of blockwise copy
+        // constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
+
+        using SliceLengths = Sequence<NPerBlock, HPerBlock, WPerBlock>;
+        using ABlockTransferThreadClusterLengths_AK0_M_AK1 = Sequence<4, 64, 1>;
+        using ABlockTransferThreadClusterArrangeOrder = Sequence<1, 0, 2>;
+        using ABlockTransferSrcAccessOrder = int;
+        constexpr index_t ABlockTransferSrcVectorDim = 2;
+        constexpr index_t ABlockTransferSrcScalarPerVector = 1;
+        constexpr index_t ABlockTransferDstScalarPerVector = 1;
+
+        auto in_global_load =
+            ThreadGroupTensorSliceTransfer_v4r1<ThisThreadBlock,
+                                                ElementwiseOperation,
+                                                ck::tensor_operation::element_wise::PassThrough,
+                                                InMemoryDataOperationEnum::Set,
+                                                SliceLengths,
+                                                ABlockTransferThreadClusterLengths_AK0_M_AK1,
+                                                ABlockTransferThreadClusterArrangeOrder,
+                                                InDataType,
+                                                InDataType,
+                                                decltype(in_grid_1d_desc),
+                                                decltype(a_block_desc_ak0_m_ak1),
+                                                ABlockTransferSrcAccessOrder,
+                                                Sequence<1, 0, 2>,
+                                                ABlockTransferSrcVectorDim,
+                                                2,
+                                                ABlockTransferSrcScalarPerVector,
+                                                ABlockTransferDstScalarPerVector,
+                                                1,
+                                                1,
+                                                true,
+                                                true>(
+                in_grid_1d_desc,
+                make_multi_index(0, m_block_data_idx_on_grid, 0),
+                element_op,
+                a_block_desc_ak0_m_ak1,
+                make_multi_index(0, 0, 0),
+                ck::tensor_operation::element_wise::PassThrough{});
+#endif

         auto out_global_store =
             ThreadwiseTensorSliceTransfer_v1r3<OutDataType,