"...composable_kernel.git" did not exist on "07d327a279db0e03b4c869850ffa21ad68c89d6a"
Commit 7a6dbadc authored by Po-Yen, Chen's avatar Po-Yen, Chen
Browse files

Add 'GridwisePermute' kernel

This kernel is a clone of 'GridwiseElementwise_1D'
parent fa21bcde
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
#include "ck/utility/math.hpp" #include "ck/utility/math.hpp"
#include "ck/utility/sequence.hpp" #include "ck/utility/sequence.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp" #include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_permute.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/host_utility/kernel_launch.hpp" #include "ck/host_utility/kernel_launch.hpp"
...@@ -90,8 +90,8 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType, ...@@ -90,8 +90,8 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
NumOutput == OutScalarPerVectorSeq::Size(), NumOutput == OutScalarPerVectorSeq::Size(),
"Tuple size is inconsistent with the number of in/out!"); "Tuple size is inconsistent with the number of in/out!");
using InDataTypePointerTuple = Tuple<const InDataType*>; using InDataTypePointer = const InDataType*;
using OutDataTypePointerTuple = Tuple<OutDataType*>; using OutDataTypePointer = OutDataType*;
template <typename Desc_M> template <typename Desc_M>
static auto PadDescriptor_M_1d(Desc_M desc_m, index_t gridSize, index_t blockSize) static auto PadDescriptor_M_1d(Desc_M desc_m, index_t gridSize, index_t blockSize)
...@@ -147,13 +147,13 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType, ...@@ -147,13 +147,13 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
}; };
}; };
using InGrid1dDescTuple = Tuple<decltype(GenerateInOutGrid1dDesc())>; using InGrid1dDesc = decltype(GenerateInOutGrid1dDesc());
using OutGrid1dDescTuple = Tuple<decltype(GenerateInOutGrid1dDesc())>; using OutGrid1dDesc = decltype(GenerateInOutGrid1dDesc());
using GridwiseElementwise = GridwiseElementwise_1D<InGrid1dDescTuple, using GridwisePermute = GridwisePermute<InGrid1dDesc,
OutGrid1dDescTuple, OutGrid1dDesc,
InDataTypePointerTuple, InDataTypePointer,
OutDataTypePointerTuple, OutDataTypePointer,
ElementwiseOperation, ElementwiseOperation,
MPerThread, MPerThread,
InScalarPerVectorSeq, InScalarPerVectorSeq,
...@@ -170,49 +170,30 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType, ...@@ -170,49 +170,30 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
ElementwiseOperation elementwise_op) ElementwiseOperation elementwise_op)
: blockSize_(256), : blockSize_(256),
gridSize_(120), // FIXME - Calculate the grid size by number of CU in the future gridSize_(120), // FIXME - Calculate the grid size by number of CU in the future
in_dev_buffer_(static_cast<InDataTypePointer>(in_dev_buffer)),
out_dev_buffer_(static_cast<OutDataTypePointer>(out_dev_buffer)),
in_grid_1d_desc_(MakeDescriptor_M(inLengths, inStrides, gridSize_, blockSize_)),
out_grid_1d_desc_(MakeDescriptor_M(inLengths, inStrides, gridSize_, blockSize_)),
inLengths_(inLengths), inLengths_(inLengths),
axes_(axes), axes_(axes),
inStridesArray_({inStrides}), inStrides_(inStrides),
outStridesArray_({outStrides}), outStrides_(outStrides),
elementwise_op_(elementwise_op) elementwise_op_(elementwise_op)
{ {
in_dev_buffers_ = generate_tuple(
[&](auto) {
using DataType = InDataType;
return static_cast<const DataType*>(in_dev_buffer);
},
Number<NumInput>{});
out_dev_buffers_ = generate_tuple(
[&](auto) {
using DataType = OutDataType;
return static_cast<DataType*>(out_dev_buffer);
},
Number<NumOutput>{});
in_grid_1d_desc_tuple_ = generate_tuple(
[&](auto) { return MakeDescriptor_M(inLengths, inStrides, gridSize_, blockSize_); },
Number<NumInput>{});
out_grid_1d_desc_tuple_ = generate_tuple(
[&](auto) {
return MakeDescriptor_M(inLengths, outStrides, gridSize_, blockSize_);
},
Number<NumOutput>{});
} }
index_t blockSize_; index_t blockSize_;
index_t gridSize_; index_t gridSize_;
InDataTypePointerTuple in_dev_buffers_; InDataTypePointer in_dev_buffer_;
OutDataTypePointerTuple out_dev_buffers_; OutDataTypePointer out_dev_buffer_;
InGrid1dDescTuple in_grid_1d_desc_tuple_; InGrid1dDesc in_grid_1d_desc_;
OutGrid1dDescTuple out_grid_1d_desc_tuple_; OutGrid1dDesc out_grid_1d_desc_;
std::array<index_t, NumDim> inLengths_; std::array<index_t, NumDim> inLengths_;
std::array<index_t, NumDim> axes_; std::array<index_t, NumDim> axes_;
std::array<std::array<index_t, NumDim>, NumInput> inStridesArray_; std::array<index_t, NumDim> inStrides_;
std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray_; std::array<index_t, NumDim> outStrides_;
ElementwiseOperation elementwise_op_; ElementwiseOperation elementwise_op_;
}; };
...@@ -221,11 +202,11 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType, ...@@ -221,11 +202,11 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
{ {
static float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) static float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{ {
const auto kernel = kernel_elementwise_1d<GridwiseElementwise, const auto kernel = kernel_permute<GridwisePermute,
InGrid1dDescTuple, InGrid1dDesc,
OutGrid1dDescTuple, OutGrid1dDesc,
InDataTypePointerTuple, InDataTypePointer,
OutDataTypePointerTuple, OutDataTypePointer,
ElementwiseOperation>; ElementwiseOperation>;
float elapsed_time = launch_and_time_kernel(stream_config, float elapsed_time = launch_and_time_kernel(stream_config,
...@@ -233,10 +214,10 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType, ...@@ -233,10 +214,10 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
dim3(arg.gridSize_), dim3(arg.gridSize_),
dim3(arg.blockSize_), dim3(arg.blockSize_),
0, 0,
arg.in_grid_1d_desc_tuple_, arg.in_grid_1d_desc_,
arg.out_grid_1d_desc_tuple_, arg.out_grid_1d_desc_,
arg.in_dev_buffers_, arg.in_dev_buffer_,
arg.out_dev_buffers_, arg.out_dev_buffer_,
arg.elementwise_op_); arg.elementwise_op_);
return elapsed_time; return elapsed_time;
} }
...@@ -262,17 +243,15 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType, ...@@ -262,17 +243,15 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
}; };
bool valid = true; bool valid = true;
static_for<0, NumInput, 1>{}([&](auto I) { if(!IsScalarPerVectorValid(arg.inLengths_, arg.inStrides_, InScalarPerVectorSeq::At(0)))
if(!IsScalarPerVectorValid( {
arg.inLengths_, arg.inStridesArray_[I.value], InScalarPerVectorSeq::At(I)))
valid = false; valid = false;
}); }
static_for<0, NumOutput, 1>{}([&](auto I) { if(!IsScalarPerVectorValid(arg.inLengths_, arg.outStrides_, OutScalarPerVectorSeq::At(0)))
if(!IsScalarPerVectorValid( {
arg.inLengths_, arg.outStridesArray_[I.value], OutScalarPerVectorSeq::At(I)))
valid = false; valid = false;
}); }
return valid; return valid;
}; };
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace ck {
template <typename GridwisePermuteFunctor,
typename InGrid1dDesc,
typename OutGrid1dDesc,
typename InDataTypePointer,
typename OutDataTypePointer,
typename ElementwiseOperation>
__global__ void kernel_permute(const InGrid1dDesc in_grid_1d_desc,
const OutGrid1dDesc out_grid_1d_desc,
const InDataTypePointer p_in_global,
const OutDataTypePointer p_out_global,
const ElementwiseOperation elementwise_op)
{
GridwisePermuteFunctor::Run(
in_grid_1d_desc, out_grid_1d_desc, p_in_global, p_out_global, elementwise_op);
}
template <typename InGrid1dDesc,
typename OutGrid1dDesc,
typename InDataTypePointer,
typename OutDataTypePointer,
typename ElementwiseOperation,
index_t MPerThread,
typename InScalarPerVectorSeq,
typename OutScalarPerVectorSeq>
struct GridwisePermute
{
static constexpr index_t NumInput = 1;
static constexpr index_t NumOutput = 1;
static_assert(NumInput == InScalarPerVectorSeq::Size() &&
NumOutput == OutScalarPerVectorSeq::Size(),
"Tuple size is inconsistent with the number of in/out!");
static constexpr auto I0 = Number<0>{};
static constexpr auto thread_buffer_desc_m =
make_naive_tensor_descriptor_packed(make_tuple(Number<MPerThread>{}));
using PassThroughOp = tensor_operation::element_wise::PassThrough;
__device__ static void Run(const InGrid1dDesc in_grid_1d_desc,
const OutGrid1dDesc out_grid_1d_desc,
const InDataTypePointer p_in_global,
const OutDataTypePointer p_out_global,
const ElementwiseOperation elementwise_op)
{
const index_t thread_global_id = get_thread_global_1d_id();
using InDataType = remove_cv_t<remove_pointer_t<InDataTypePointer>>;
auto in_thread_buf = StaticBuffer<AddressSpaceEnum::Vgpr, InDataType, MPerThread, true>{};
using OutDataType = remove_cv_t<remove_pointer_t<OutDataTypePointer>>;
auto out_thread_buf = StaticBuffer<AddressSpaceEnum::Vgpr, OutDataType, MPerThread, true>{};
auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_in_global, in_grid_1d_desc.GetElementSpaceSize());
auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_out_global, out_grid_1d_desc.GetElementSpaceSize());
const auto thread_global_offset = make_multi_index(thread_global_id * MPerThread);
const index_t blockSize = get_block_size();
const index_t blockPerGrid = get_grid_size();
const auto M = in_grid_1d_desc.GetLength(I0);
const index_t loop_step = blockPerGrid * blockSize * MPerThread;
const auto loop_step_index = make_multi_index(loop_step);
auto in_global_load =
ThreadwiseTensorSliceTransfer_v2<InDataType,
InDataType,
decltype(in_grid_1d_desc),
decltype(thread_buffer_desc_m),
Sequence<MPerThread>, // SliceLengths
Sequence<0>, // DimAccessOrder
0, // SrcVectorDim
InScalarPerVectorSeq::At(0), // ScalarPerVector
1, // SrcScalarStrideInVector
false>{in_grid_1d_desc, thread_global_offset};
auto out_global_store =
ThreadwiseTensorSliceTransfer_v1r3<OutDataType,
OutDataType,
decltype(thread_buffer_desc_m),
decltype(out_grid_1d_desc),
PassThroughOp,
Sequence<MPerThread>, // SliceLengths
Sequence<0>, // DimAccessOrder
0, // SrcVectorDim
OutScalarPerVectorSeq::At(0),
InMemoryDataOperationEnum::Set,
1,
false>(
out_grid_1d_desc, thread_global_offset, PassThroughOp{});
index_t num_iter = M / (loop_step);
do
{
in_global_load.Run(in_grid_1d_desc,
in_global_buf,
thread_buffer_desc_m,
make_tuple(I0),
in_thread_buf);
in_global_load.MoveSrcSliceWindow(in_grid_1d_desc, loop_step_index);
static_for<0, MPerThread, 1>{}([&](auto iM) {
// get reference to in data
const auto& in_data_ref = in_thread_buf(iM);
// get reference to dst data
auto& out_data_ref = out_thread_buf(iM);
elementwise_op(out_data_ref, in_data_ref);
});
out_global_store.Run(thread_buffer_desc_m,
make_tuple(I0),
out_thread_buf,
out_grid_1d_desc,
out_global_buf);
out_global_store.MoveDstSliceWindow(out_grid_1d_desc, loop_step_index);
} while(--num_iter);
}
};
} // namespace ck
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment