Commit 7a6dbadc authored by Po-Yen, Chen

Add 'GridwisePermute' kernel

This kernel is a clone of 'GridwiseElementwise_1D'
parent fa21bcde
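For orientation, here is a minimal sketch of how the reworked device op could be instantiated. The header path, the enclosing namespace, and every template parameter after InDataType are assumptions based on typical composable_kernel device ops; only InDataType's position is visible in this diff.

    #include "ck/tensor_operation/gpu/device/device_permute.hpp" // assumed path

    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    // Hypothetical instantiation: one input and one output (NumInput == NumOutput == 1),
    // so each scalar-per-vector sequence carries exactly one entry.
    using DevicePermuteInstance =
        ck::tensor_operation::device::DevicePermute<float,            // InDataType
                                                    float,            // OutDataType (assumed position)
                                                    PassThrough,      // ElementwiseOperation (assumed)
                                                    3,                // NumDim (assumed)
                                                    8,                // MPerThread (assumed)
                                                    ck::Sequence<1>,  // InScalarPerVectorSeq
                                                    ck::Sequence<1>>; // OutScalarPerVectorSeq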
@@ -10,7 +10,7 @@
#include "ck/utility/math.hpp"
#include "ck/utility/sequence.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
-#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_permute.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/host_utility/kernel_launch.hpp"
@@ -90,8 +90,8 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
                      NumOutput == OutScalarPerVectorSeq::Size(),
                  "Tuple size is inconsistent with the number of in/out!");

-    using InDataTypePointerTuple  = Tuple<const InDataType*>;
-    using OutDataTypePointerTuple = Tuple<OutDataType*>;
+    using InDataTypePointer  = const InDataType*;
+    using OutDataTypePointer = OutDataType*;

    template <typename Desc_M>
    static auto PadDescriptor_M_1d(Desc_M desc_m, index_t gridSize, index_t blockSize)
@@ -147,13 +147,13 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
        };
    };

-    using InGrid1dDescTuple  = Tuple<decltype(GenerateInOutGrid1dDesc())>;
-    using OutGrid1dDescTuple = Tuple<decltype(GenerateInOutGrid1dDesc())>;
+    using InGrid1dDesc  = decltype(GenerateInOutGrid1dDesc());
+    using OutGrid1dDesc = decltype(GenerateInOutGrid1dDesc());

-    using GridwiseElementwise = GridwiseElementwise_1D<InGrid1dDescTuple,
-                                                       OutGrid1dDescTuple,
-                                                       InDataTypePointerTuple,
-                                                       OutDataTypePointerTuple,
+    using GridwisePermute = GridwisePermute<InGrid1dDesc,
+                                            OutGrid1dDesc,
+                                            InDataTypePointer,
+                                            OutDataTypePointer,
                                            ElementwiseOperation,
                                            MPerThread,
                                            InScalarPerVectorSeq,
@@ -170,49 +170,30 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
                 ElementwiseOperation elementwise_op)
            : blockSize_(256),
              gridSize_(120), // FIXME - Calculate the grid size by number of CU in the future
+             in_dev_buffer_(static_cast<InDataTypePointer>(in_dev_buffer)),
+             out_dev_buffer_(static_cast<OutDataTypePointer>(out_dev_buffer)),
+             in_grid_1d_desc_(MakeDescriptor_M(inLengths, inStrides, gridSize_, blockSize_)),
+             out_grid_1d_desc_(MakeDescriptor_M(inLengths, outStrides, gridSize_, blockSize_)),
              inLengths_(inLengths),
              axes_(axes),
-             inStridesArray_({inStrides}),
-             outStridesArray_({outStrides}),
+             inStrides_(inStrides),
+             outStrides_(outStrides),
              elementwise_op_(elementwise_op)
        {
-           in_dev_buffers_ = generate_tuple(
-               [&](auto) {
-                   using DataType = InDataType;
-                   return static_cast<const DataType*>(in_dev_buffer);
-               },
-               Number<NumInput>{});
-
-           out_dev_buffers_ = generate_tuple(
-               [&](auto) {
-                   using DataType = OutDataType;
-                   return static_cast<DataType*>(out_dev_buffer);
-               },
-               Number<NumOutput>{});
-
-           in_grid_1d_desc_tuple_ = generate_tuple(
-               [&](auto) { return MakeDescriptor_M(inLengths, inStrides, gridSize_, blockSize_); },
-               Number<NumInput>{});
-
-           out_grid_1d_desc_tuple_ = generate_tuple(
-               [&](auto) {
-                   return MakeDescriptor_M(inLengths, outStrides, gridSize_, blockSize_);
-               },
-               Number<NumOutput>{});
        }

        index_t blockSize_;
        index_t gridSize_;

-       InDataTypePointerTuple in_dev_buffers_;
-       OutDataTypePointerTuple out_dev_buffers_;
-       InGrid1dDescTuple in_grid_1d_desc_tuple_;
-       OutGrid1dDescTuple out_grid_1d_desc_tuple_;
+       InDataTypePointer in_dev_buffer_;
+       OutDataTypePointer out_dev_buffer_;
+       InGrid1dDesc in_grid_1d_desc_;
+       OutGrid1dDesc out_grid_1d_desc_;

        std::array<index_t, NumDim> inLengths_;
        std::array<index_t, NumDim> axes_;
-       std::array<std::array<index_t, NumDim>, NumInput> inStridesArray_;
-       std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray_;
+       std::array<index_t, NumDim> inStrides_;
+       std::array<index_t, NumDim> outStrides_;
        ElementwiseOperation elementwise_op_;
    };
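Aside: the Argument now stores a single inStrides_/outStrides_ pair plus the permutation axes_. For a permutation given by axes, output dimension i has the extent of input dimension axes[i]; the following illustration derives output lengths and packed row-major strides (these helpers are hypothetical, not part of the diff):

    #include <array>
    #include <cstddef>

    // Illustration only: output lengths are the input lengths reindexed by the
    // permutation, e.g. axes = {0, 2, 1} swaps the last two dimensions.
    template <std::size_t NumDim>
    std::array<std::size_t, NumDim> permute_lengths(const std::array<std::size_t, NumDim>& inLengths,
                                                    const std::array<std::size_t, NumDim>& axes)
    {
        std::array<std::size_t, NumDim> outLengths{};
        for(std::size_t i = 0; i < NumDim; ++i)
            outLengths[i] = inLengths[axes[i]];
        return outLengths;
    }

    // Packed (row-major) strides for a dense tensor of the given lengths.
    template <std::size_t NumDim>
    std::array<std::size_t, NumDim> packed_strides(const std::array<std::size_t, NumDim>& lengths)
    {
        std::array<std::size_t, NumDim> strides{};
        std::size_t stride = 1;
        for(std::size_t i = NumDim; i-- > 0;)
        {
            strides[i] = stride;
            stride *= lengths[i];
        }
        return strides;
    }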
@@ -221,11 +202,11 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
    {
        static float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
-           const auto kernel = kernel_elementwise_1d<GridwiseElementwise,
-                                                     InGrid1dDescTuple,
-                                                     OutGrid1dDescTuple,
-                                                     InDataTypePointerTuple,
-                                                     OutDataTypePointerTuple,
+           const auto kernel = kernel_permute<GridwisePermute,
+                                              InGrid1dDesc,
+                                              OutGrid1dDesc,
+                                              InDataTypePointer,
+                                              OutDataTypePointer,
                                               ElementwiseOperation>;

            float elapsed_time = launch_and_time_kernel(stream_config,
@@ -233,10 +214,10 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
                                                        dim3(arg.gridSize_),
                                                        dim3(arg.blockSize_),
                                                        0,
-                                                       arg.in_grid_1d_desc_tuple_,
-                                                       arg.out_grid_1d_desc_tuple_,
-                                                       arg.in_dev_buffers_,
-                                                       arg.out_dev_buffers_,
+                                                       arg.in_grid_1d_desc_,
+                                                       arg.out_grid_1d_desc_,
+                                                       arg.in_dev_buffer_,
+                                                       arg.out_dev_buffer_,
                                                        arg.elementwise_op_);
            return elapsed_time;
        }
@@ -262,17 +243,15 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
        };

        bool valid = true;
-       static_for<0, NumInput, 1>{}([&](auto I) {
-           if(!IsScalarPerVectorValid(
-                  arg.inLengths_, arg.inStridesArray_[I.value], InScalarPerVectorSeq::At(I)))
+       if(!IsScalarPerVectorValid(arg.inLengths_, arg.inStrides_, InScalarPerVectorSeq::At(0)))
+       {
            valid = false;
-       });
+       }

-       static_for<0, NumOutput, 1>{}([&](auto I) {
-           if(!IsScalarPerVectorValid(
-                  arg.inLengths_, arg.outStridesArray_[I.value], OutScalarPerVectorSeq::At(I)))
+       if(!IsScalarPerVectorValid(arg.inLengths_, arg.outStrides_, OutScalarPerVectorSeq::At(0)))
+       {
            valid = false;
-       });
+       }

        return valid;
    };
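The body of the IsScalarPerVectorValid lambda is collapsed out of this diff. A common shape for such a check in CK-style code is sketched below; this is an assumption about what the lambda does, not its actual body: vector access of width scalarPerVector is typically legal only when the innermost dimension is contiguous and its length divides evenly by the vector width, while scalarPerVector == 1 is always legal.

    #include <array>

    // Sketch under stated assumptions; the real lambda may differ.
    template <std::size_t NumDim>
    bool is_scalar_per_vector_valid(const std::array<int, NumDim>& lengths,
                                    const std::array<int, NumDim>& strides,
                                    int scalarPerVector)
    {
        if(scalarPerVector == 1)
            return true; // scalar access is always legal

        // vectorized access assumed to need a unit-stride innermost dimension
        // whose length is a multiple of the vector width
        return strides[NumDim - 1] == 1 && lengths[NumDim - 1] % scalarPerVector == 0;
    }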
ck/tensor_operation/gpu/grid/gridwise_permute.hpp (new file):
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

namespace ck {

template <typename GridwisePermuteFunctor,
          typename InGrid1dDesc,
          typename OutGrid1dDesc,
          typename InDataTypePointer,
          typename OutDataTypePointer,
          typename ElementwiseOperation>
__global__ void kernel_permute(const InGrid1dDesc in_grid_1d_desc,
                               const OutGrid1dDesc out_grid_1d_desc,
                               const InDataTypePointer p_in_global,
                               const OutDataTypePointer p_out_global,
                               const ElementwiseOperation elementwise_op)
{
    GridwisePermuteFunctor::Run(
        in_grid_1d_desc, out_grid_1d_desc, p_in_global, p_out_global, elementwise_op);
}
template <typename InGrid1dDesc,
          typename OutGrid1dDesc,
          typename InDataTypePointer,
          typename OutDataTypePointer,
          typename ElementwiseOperation,
          index_t MPerThread,
          typename InScalarPerVectorSeq,
          typename OutScalarPerVectorSeq>
struct GridwisePermute
{
    static constexpr index_t NumInput  = 1;
    static constexpr index_t NumOutput = 1;

    static_assert(NumInput == InScalarPerVectorSeq::Size() &&
                      NumOutput == OutScalarPerVectorSeq::Size(),
                  "Tuple size is inconsistent with the number of in/out!");

    static constexpr auto I0 = Number<0>{};

    static constexpr auto thread_buffer_desc_m =
        make_naive_tensor_descriptor_packed(make_tuple(Number<MPerThread>{}));

    using PassThroughOp = tensor_operation::element_wise::PassThrough;

    __device__ static void Run(const InGrid1dDesc in_grid_1d_desc,
                               const OutGrid1dDesc out_grid_1d_desc,
                               const InDataTypePointer p_in_global,
                               const OutDataTypePointer p_out_global,
                               const ElementwiseOperation elementwise_op)
    {
        const index_t thread_global_id = get_thread_global_1d_id();

        // per-thread register (VGPR) buffers holding MPerThread elements each
        using InDataType   = remove_cv_t<remove_pointer_t<InDataTypePointer>>;
        auto in_thread_buf = StaticBuffer<AddressSpaceEnum::Vgpr, InDataType, MPerThread, true>{};

        using OutDataType = remove_cv_t<remove_pointer_t<OutDataTypePointer>>;
        auto out_thread_buf =
            StaticBuffer<AddressSpaceEnum::Vgpr, OutDataType, MPerThread, true>{};

        auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_in_global, in_grid_1d_desc.GetElementSpaceSize());

        auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_out_global, out_grid_1d_desc.GetElementSpaceSize());

        const auto thread_global_offset = make_multi_index(thread_global_id * MPerThread);

        // grid-stride loop step: the whole grid advances this many elements per iteration
        const index_t blockSize    = get_block_size();
        const index_t blockPerGrid = get_grid_size();
        const auto M               = in_grid_1d_desc.GetLength(I0);
        const index_t loop_step    = blockPerGrid * blockSize * MPerThread;
        const auto loop_step_index = make_multi_index(loop_step);

        auto in_global_load =
            ThreadwiseTensorSliceTransfer_v2<InDataType,
                                             InDataType,
                                             decltype(in_grid_1d_desc),
                                             decltype(thread_buffer_desc_m),
                                             Sequence<MPerThread>,        // SliceLengths
                                             Sequence<0>,                 // DimAccessOrder
                                             0,                           // SrcVectorDim
                                             InScalarPerVectorSeq::At(0), // ScalarPerVector
                                             1, // SrcScalarStrideInVector
                                             false>{in_grid_1d_desc, thread_global_offset};

        auto out_global_store =
            ThreadwiseTensorSliceTransfer_v1r3<OutDataType,
                                               OutDataType,
                                               decltype(thread_buffer_desc_m),
                                               decltype(out_grid_1d_desc),
                                               PassThroughOp,
                                               Sequence<MPerThread>,         // SliceLengths
                                               Sequence<0>,                  // DimAccessOrder
                                               0,                            // DstVectorDim
                                               OutScalarPerVectorSeq::At(0), // ScalarPerVector
                                               InMemoryDataOperationEnum::Set,
                                               1, // DstScalarStrideInVector
                                               false>(
                out_grid_1d_desc, thread_global_offset, PassThroughOp{});
        index_t num_iter = M / loop_step;
        do
        {
            in_global_load.Run(in_grid_1d_desc,
                               in_global_buf,
                               thread_buffer_desc_m,
                               make_tuple(I0),
                               in_thread_buf);

            in_global_load.MoveSrcSliceWindow(in_grid_1d_desc, loop_step_index);

            static_for<0, MPerThread, 1>{}([&](auto iM) {
                // get reference to in data
                const auto& in_data_ref = in_thread_buf(iM);

                // get reference to dst data
                auto& out_data_ref = out_thread_buf(iM);

                elementwise_op(out_data_ref, in_data_ref);
            });

            out_global_store.Run(thread_buffer_desc_m,
                                 make_tuple(I0),
                                 out_thread_buf,
                                 out_grid_1d_desc,
                                 out_global_buf);

            out_global_store.MoveDstSliceWindow(out_grid_1d_desc, loop_step_index);
        } while(--num_iter);
    }
};

} // namespace ck
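To sanity-check the grid-stride loop in Run(): each iteration every thread handles MPerThread elements and the whole grid advances by loop_step elements, so num_iter = M / loop_step only works out if the 1-D extent M is a multiple of loop_step, which is what PadDescriptor_M_1d on the device side appears to guarantee (its body is not shown here). A worked example using the Invoker's hard-coded launch configuration and an arbitrary MPerThread of 8:

    // Worked example of the loop arithmetic; MPerThread = 8 and the element
    // count are illustrative choices, gridSize/blockSize mirror the Invoker above.
    constexpr int gridSize   = 120; // blocks per grid
    constexpr int blockSize  = 256; // threads per block
    constexpr int MPerThread = 8;   // elements per thread per iteration

    constexpr int loop_step = gridSize * blockSize * MPerThread;
    static_assert(loop_step == 245760);

    // assuming M is padded up to a multiple of loop_step:
    constexpr int elements = 1000000;
    constexpr int padded_M = ((elements + loop_step - 1) / loop_step) * loop_step;
    static_assert(padded_M == 1228800);

    constexpr int num_iter = padded_M / loop_step; // the do/while runs 5 times
    static_assert(num_iter == 5);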