Commit 9dce6851 authored by Jing Zhang's avatar Jing Zhang

merge develop

parents 3cc57101 5d37d7bf
#ifndef DEVICE_REDUCE_MULTIBLOCK_PARTIAL_REDUCE_HPP
#define DEVICE_REDUCE_MULTIBLOCK_PARTIAL_REDUCE_HPP
#include <iostream>
#include <sstream>
#include "device.hpp"
#include "device_reduce.hpp"
#include "device_reduce_common.hpp"
#include "gridwise_2d_reduction_multiblock_partial_reduce.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename InDataType,
typename AccDataType,
typename OutDataType,
int Rank,
typename ReduceDims,
typename ReduceOperation,
typename InElementwiseOperation,
typename AccElementwiseOperation,
bool PropagateNan,
bool NeedIndices,
index_t BlockSize,
index_t MThreadClusterSize,
index_t KThreadClusterSize,
index_t MThreadSliceSize,
index_t KThreadSliceSize,
index_t InSrcVectorDim,
index_t InSrcVectorSize,
index_t OutDstVectorSize>
struct DeviceReduceMultiBlockPartialReduce
: public DeviceReduce<InElementwiseOperation, AccElementwiseOperation>
{
static_assert(Rank <= 6, "Bigger Rank size is not supported!");
static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
"Invalid thread cluster size assignments!");
static_assert(OutDstVectorSize == 1, "OutDstVectorSize must be 1 for MultiBlockPartialReduce!");
using IndexDataType = int32_t;
using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>());
static constexpr index_t srcDims = Rank;
static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
static constexpr bool reduceAllDims = (InvariantDims::Size() == 0);
static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
size_t GetWorkspaceSizeInBytes(const std::vector<int>& inLengths) override
{
size_t invariant_total_length;
size_t reduce_total_length;
std::tie(invariant_total_length, reduce_total_length) =
get_2d_lengths<Rank, ReduceDims>(inLengths);
int iterations = 1;
while(true)
{
int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
(K_BlockTileSize * iterations);
// we want blkGroupSize to be no more than 128
if(testBlkGroupSize <= 128)
break;
iterations++;
};
int blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
(K_BlockTileSize * iterations);
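// Worked example (illustrative numbers only): with K_BlockTileSize = 32 and
// reduce_total_length = 100000, the loop above settles on iterations = 25, so
// blkGroupSize = ceil(100000 / (32 * 25)) = 125 (<= 128) partial-reduce blocks
// along K for every M tile.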
size_t workspace_size = invariant_total_length * blkGroupSize;
size_t wsSizeInBytes =
!NeedIndices ? workspace_size * sizeof(AccDataType)
: workspace_size * (sizeof(AccDataType) + sizeof(int)) + 64 + sizeof(int);
return (wsSizeInBytes);
};
bool HasFurtherCall() override { return (true); };
static auto MakeSrc2dDescriptor(const std::vector<int>& inLengths,
const std::vector<int>& inStrides,
int blkGroupSize,
int kBlockTileIterations)
{
const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<srcDims>{});
const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
const auto in_grid_desc_m_k = [&]() {
if constexpr(reduceAllDims)
{
const auto one_dim_inDesc = transform_tensor_descriptor(
inDesc,
make_tuple(make_merge_transform(tupleSrcLengths)),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
return transform_tensor_descriptor(one_dim_inDesc,
make_tuple(make_unmerge_transform(make_tuple(
1, one_dim_inDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
}
else
{
const auto toReduceDimLengths =
make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
const auto invariantDimLengths =
make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});
return transform_tensor_descriptor(
inDesc,
make_tuple(make_merge_transform(invariantDimLengths),
make_merge_transform(toReduceDimLengths)),
make_tuple(InvariantDims{}, ReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
}();
const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});
const int reduceSizePerBlock = K_BlockTileSize * kBlockTileIterations;
const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
const auto inPad_K = reduceSizePerBlock * blkGroupSize - innerLen;
auto in_grid_desc_m_k_padded =
transform_tensor_descriptor(in_grid_desc_m_k,
make_tuple(make_right_pad_transform(outerLen, inPad_M),
make_right_pad_transform(innerLen, inPad_K)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return (in_grid_desc_m_k_padded);
};
static auto MakeWorkspace2dDescriptor(int outerLen, int blkGroupSize)
{
auto ws_desc_m_k = make_naive_tensor_descriptor_packed(make_tuple(outerLen, blkGroupSize));
const auto wsPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
auto ws_desc_m_k_padded =
transform_tensor_descriptor(ws_desc_m_k,
make_tuple(make_right_pad_transform(outerLen, wsPad),
make_pass_through_transform(blkGroupSize)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return (ws_desc_m_k_padded);
};
struct Argument : public BaseArgument
{
Argument(const std::vector<index_t>& inLengths,
const std::vector<index_t>& inStrides,
const std::vector<index_t>& outLengths,
const std::vector<index_t>& outStrides,
float alpha,
float beta,
const InDataType* in_dev,
OutDataType* out_dev,
IndexDataType* out_indices_dev,
AccDataType* workspace_dev,
const InElementwiseOperation& in_elementwise_op,
const AccElementwiseOperation& acc_elementwise_op)
: in_dev_{in_dev},
out_dev_{out_dev},
out_indices_dev_{out_indices_dev},
workspace_dev_{workspace_dev}
{
inLengths_ = inLengths;
inStrides_ = inStrides;
outLengths_ = outLengths;
outStrides_ = outStrides;
in_elementwise_op_ = in_elementwise_op;
acc_elementwise_op_ = acc_elementwise_op;
alpha_ = static_cast<AccDataType>(alpha);
beta_ = static_cast<OutDataType>(beta);
std::tie(invariant_total_length, reduce_total_length) =
get_2d_lengths<Rank, ReduceDims>(inLengths);
if constexpr(InvariantDims::Size() == 0)
invariant_lowest_length = 1;
else
invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)];
reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)];
int iterations = 1;
while(true)
{
int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
(K_BlockTileSize * iterations);
// we want blkGroupSize to be no more than 128
if(testBlkGroupSize <= 128)
break;
iterations++;
};
blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
(K_BlockTileSize * iterations);
kBlockTileIterations = iterations;
gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
M_BlockTileSize * blkGroupSize;
size_t ws_buf2_bytes_offset = math::integer_least_multiple(
invariant_total_length * blkGroupSize * sizeof(AccDataType), 64);
if constexpr(NeedIndices)
workspace_indices_dev_ = reinterpret_cast<int*>(
reinterpret_cast<char*>(workspace_dev_) + ws_buf2_bytes_offset);
else
workspace_indices_dev_ = nullptr;
}
std::vector<int> inLengths_;
std::vector<int> inStrides_;
std::vector<int> outLengths_;
std::vector<int> outStrides_;
AccDataType alpha_;
OutDataType beta_;
const InDataType* in_dev_;
OutDataType* out_dev_;
IndexDataType* out_indices_dev_;
AccDataType* workspace_dev_;
IndexDataType* workspace_indices_dev_;
InElementwiseOperation in_elementwise_op_;
AccElementwiseOperation acc_elementwise_op_;
int invariant_lowest_length;
int reduce_lowest_length;
size_t invariant_total_length;
size_t reduce_total_length;
index_t blkGroupSize;
index_t kBlockTileIterations;
size_t gridSize;
};
struct Invoker : public BaseInvoker
{
float Run(const Argument& arg, int nrepeat = 1)
{
const auto in_grid_desc_m_k = DeviceReduceMultiBlockPartialReduce::MakeSrc2dDescriptor(
arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.kBlockTileIterations);
const auto ws_desc_m_k = DeviceReduceMultiBlockPartialReduce::MakeWorkspace2dDescriptor(
arg.invariant_total_length, arg.blkGroupSize);
using InGridDesc_M_K = decltype(in_grid_desc_m_k);
using WorkspaceDesc_M_K = decltype(ws_desc_m_k);
using GridwiseReduce =
GridwiseReduction_mk_to_mk_multiblock_partial_reduce<InDataType,
AccDataType,
IndexDataType,
InGridDesc_M_K,
WorkspaceDesc_M_K,
ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
PropagateNan,
BlockSize,
MThreadClusterSize,
KThreadClusterSize,
MThreadSliceSize,
KThreadSliceSize,
InSrcVectorDim,
InSrcVectorSize,
OutDstVectorSize>;
float avg_time = 0;
const auto kernel = kernel_partial_reduce_multiblock<GridwiseReduce,
NeedIndices,
InDataType,
AccDataType,
IndexDataType,
InGridDesc_M_K,
WorkspaceDesc_M_K,
InElementwiseOperation,
AccElementwiseOperation>;
avg_time = launch_and_time_kernel(kernel,
nrepeat,
dim3(arg.gridSize),
dim3(BlockSize),
0,
in_grid_desc_m_k,
ws_desc_m_k,
arg.in_elementwise_op_,
arg.acc_elementwise_op_,
arg.blkGroupSize,
arg.kBlockTileIterations,
arg.in_dev_,
arg.workspace_dev_,
arg.workspace_indices_dev_);
return (avg_time);
};
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
};
};
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
if constexpr(OutDstVectorSize != 1)
return (false);
if constexpr(InSrcVectorDim == 0)
{
if constexpr(InvariantDims::Size() == 0)
return (false);
if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1)
return (false);
if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
return (false);
}
else
{
if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1)
return (false);
if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
return (false);
};
// cases with a small reduce_total_length should be handled by the BlockWise method
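// For example, with BlockSize = 256 and KThreadSliceSize = 8 (illustrative
// template parameters), reduce lengths of 2048 or fewer are rejected here.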
if(pArg->reduce_total_length <= BlockSize * KThreadSliceSize)
return (false);
return (true);
};
std::vector<int> GetWorkspace2dLengths(const BaseArgument* p_arg) override
{
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
return (
std::vector<int>{static_cast<int>(pArg->invariant_total_length), pArg->blkGroupSize});
};
std::unique_ptr<BaseArgument>
MakeArgumentPointer(const std::vector<int>& inLengths,
const std::vector<int>& inStrides,
const std::vector<int>& outLengths,
const std::vector<int>& outStrides,
float alpha,
float beta,
const void* in_dev,
void* out_dev,
void* out_indices_dev,
void* workspace_dev,
const InElementwiseOperation& in_elementwise_op,
const AccElementwiseOperation& acc_elementwise_op) override
{
return std::make_unique<Argument>(inLengths,
inStrides,
outLengths,
outStrides,
alpha,
beta,
static_cast<const InDataType*>(in_dev),
static_cast<OutDataType*>(out_dev),
static_cast<IndexDataType*>(out_indices_dev),
static_cast<AccDataType*>(workspace_dev),
in_elementwise_op,
acc_elementwise_op);
};
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>();
};
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "DeviceReduceMultiBlockPartialReduce<" << BlockSize << ",";
str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
// clang-format on
return str.str();
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif
#ifndef DEVICE_REDUCE_THREADWISE_HPP
#define DEVICE_REDUCE_THREADWISE_HPP
#include <iostream>
#include <sstream>
#include "device.hpp"
#include "device_reduce.hpp"
#include "device_reduce_common.hpp"
#include "gridwise_2d_reduction_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename InDataType,
typename AccDataType,
typename OutDataType,
index_t Rank,
typename ReduceDims,
typename ReduceOperation,
typename InElementwiseOperation,
typename OutElementwiseOperation,
bool PropagateNan,
bool NeedIndices,
index_t BlockSize,
index_t MThreadClusterSize,
index_t KThreadClusterSize,
index_t MThreadSliceSize,
index_t KThreadSliceSize,
index_t InSrcVectorDim,
index_t InSrcVectorSize,
index_t OutDstVectorSize>
struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutElementwiseOperation>
{
static_assert(Rank <= 6, "Bigger Rank size is not supported!");
static_assert((BlockSize == MThreadClusterSize) && (KThreadClusterSize == 1),
"The ThreadWise method can only be used with KThreadClusterSize equal to 1!");
using IndexDataType = int32_t;
static constexpr bool BetaIsZero = NeedIndices;
using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>());
static constexpr index_t srcDims = Rank;
static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
static constexpr bool reduceAllDims = (InvariantDims::Size() == 0);
static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
static auto MakeSrc2dDescriptor(const std::vector<int>& inLengths,
const std::vector<int>& inStrides)
{
const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<srcDims>{});
const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
const auto in_grid_desc_m_k = [&]() {
if constexpr(reduceAllDims)
{
const auto one_dim_inDesc = transform_tensor_descriptor(
inDesc,
make_tuple(make_merge_transform(tupleSrcLengths)),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
return transform_tensor_descriptor(one_dim_inDesc,
make_tuple(make_unmerge_transform(make_tuple(
1, one_dim_inDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
}
else
{
const auto toReduceDimLengths =
make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
const auto invariantDimLengths =
make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});
return transform_tensor_descriptor(
inDesc,
make_tuple(make_merge_transform(invariantDimLengths),
make_merge_transform(toReduceDimLengths)),
make_tuple(InvariantDims{}, ReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
}();
const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});
const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
const auto inPad_K = math::integer_least_multiple(innerLen, K_BlockTileSize) - innerLen;
auto in_grid_desc_m_k_padded =
transform_tensor_descriptor(in_grid_desc_m_k,
make_tuple(make_right_pad_transform(outerLen, inPad_M),
make_right_pad_transform(innerLen, inPad_K)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return (in_grid_desc_m_k_padded);
};
static auto MakeDst1dDescriptor(const std::vector<int>& outLengths,
const std::vector<int>& outStrides)
{
const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<dstDims>{});
const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<dstDims>{});
auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
auto out_grid_desc_m = transform_tensor_descriptor(
outDesc,
make_tuple(make_merge_transform(tupleDstLengths)),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
const auto outerLen = out_grid_desc_m.GetLength(Number<0>{});
const auto outPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
auto out_grid_desc_m_padded =
transform_tensor_descriptor(out_grid_desc_m,
make_tuple(make_right_pad_transform(outerLen, outPad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
return (out_grid_desc_m_padded);
};
struct Argument : public BaseArgument
{
Argument(const std::vector<int>& inLengths,
const std::vector<int>& inStrides,
const std::vector<int>& outLengths,
const std::vector<int>& outStrides,
float alpha,
float beta,
const InDataType* in_dev,
OutDataType* out_dev,
IndexDataType* out_indices_dev,
AccDataType* workspace_dev,
const InElementwiseOperation& in_elementwise_op,
const OutElementwiseOperation& acc_elementwise_op)
: in_dev_{in_dev}, out_dev_{out_dev}, out_indices_dev_{out_indices_dev}
{
(void)workspace_dev;
inLengths_ = inLengths;
inStrides_ = inStrides;
outLengths_ = outLengths;
outStrides_ = outStrides;
in_elementwise_op_ = in_elementwise_op;
acc_elementwise_op_ = acc_elementwise_op;
alpha_ = static_cast<AccDataType>(alpha);
beta_ = static_cast<OutDataType>(beta);
std::tie(invariant_total_length, reduce_total_length) =
get_2d_lengths<Rank, ReduceDims>(inLengths);
if constexpr(InvariantDims::Size() == 0)
invariant_lowest_length = 1;
else
invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)];
reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)];
gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
M_BlockTileSize;
}
std::vector<int> inLengths_;
std::vector<int> inStrides_;
std::vector<int> outLengths_;
std::vector<int> outStrides_;
AccDataType alpha_;
OutDataType beta_;
const InDataType* in_dev_;
OutDataType* out_dev_;
IndexDataType* out_indices_dev_;
InElementwiseOperation in_elementwise_op_;
OutElementwiseOperation acc_elementwise_op_;
int invariant_lowest_length;
int reduce_lowest_length;
size_t invariant_total_length;
size_t reduce_total_length;
size_t gridSize;
};
struct Invoker : public BaseInvoker
{
float Run(const Argument& arg, int nrepeat = 1)
{
const auto in_grid_desc_m_k =
DeviceReduceThreadWise::MakeSrc2dDescriptor(arg.inLengths_, arg.inStrides_);
const auto out_grid_desc_m =
DeviceReduceThreadWise::MakeDst1dDescriptor(arg.outLengths_, arg.outStrides_);
using InGridDesc_M_K = decltype(in_grid_desc_m_k);
using OutGridDesc_M = decltype(out_grid_desc_m);
using GridwiseReduce = GridwiseReduction_mk_to_m_threadwise<InDataType,
OutDataType,
AccDataType,
IndexDataType,
InGridDesc_M_K,
OutGridDesc_M,
ReduceOperation,
InElementwiseOperation,
OutElementwiseOperation,
PropagateNan,
BetaIsZero,
BlockSize,
MThreadClusterSize,
KThreadClusterSize,
MThreadSliceSize,
KThreadSliceSize,
InSrcVectorDim,
InSrcVectorSize,
OutDstVectorSize>;
float avg_time = 0;
const auto kernel = kernel_reduce_threadwise<GridwiseReduce,
NeedIndices,
InDataType,
OutDataType,
AccDataType,
IndexDataType,
InGridDesc_M_K,
OutGridDesc_M,
InElementwiseOperation,
OutElementwiseOperation>;
avg_time = launch_and_time_kernel(kernel,
nrepeat,
dim3(arg.gridSize),
dim3(BlockSize),
0,
in_grid_desc_m_k,
out_grid_desc_m,
arg.in_elementwise_op_,
arg.acc_elementwise_op_,
arg.alpha_,
arg.in_dev_,
arg.beta_,
arg.out_dev_,
arg.out_indices_dev_);
return (avg_time);
};
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
};
};
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
if constexpr(InSrcVectorDim == 0)
{
if constexpr(InvariantDims::Size() == 0)
return (false);
if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1)
return (false);
if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
return (false);
}
else
{
if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1)
return (false);
if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
return (false);
};
// TODO: this restriction on the output vector size could be relaxed
if(pArg->invariant_lowest_length % OutDstVectorSize != 0)
return (false);
// TODO: remove this. This should return true as long as this DeviceOp instance supports the
// case; for bigger reduce_total_length, the BlockWise method is expected to give better
// performance.
if(pArg->reduce_total_length / KThreadSliceSize >= 32)
return (false);
return (true);
};
std::unique_ptr<BaseArgument>
MakeArgumentPointer(const std::vector<int>& inLengths,
const std::vector<int>& inStrides,
const std::vector<int>& outLengths,
const std::vector<int>& outStrides,
float alpha,
float beta,
const void* in_dev,
void* out_dev,
void* out_indices_dev,
void* workspace_dev,
const InElementwiseOperation& in_elementwise_op,
const OutElementwiseOperation& acc_elementwise_op) override
{
return std::make_unique<Argument>(inLengths,
inStrides,
outLengths,
outStrides,
alpha,
beta,
static_cast<const InDataType*>(in_dev),
static_cast<OutDataType*>(out_dev),
static_cast<IndexDataType*>(out_indices_dev),
static_cast<AccDataType*>(workspace_dev),
in_elementwise_op,
acc_elementwise_op);
};
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>();
};
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "DeviceReducceThreadWise<" << BlockSize << ",";
str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
// clang-format on
return str.str();
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_OPERATOR_MAPPING_HPP
#define CK_REDUCTION_OPERATOR_MAPPING_HPP
#include "reduction_operator.hpp"
#include "reduction_enums.hpp"
#include "element_wise_operation.hpp"
namespace ck {
// The templated struct reduce_binary_operator maps the enum Ids of binary operators to their
// respective functor classes.
// The boolean member "indexable" is also provided in reduce_binary_operator for
// easier checking by upper-layer code in the kernels (see the usage sketch after
// the specializations below).
template <typename T, ReduceTensorOp_t Op>
struct reduce_binary_operator;
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::ADD>
{
using opType = reduce::Add<T>;
using dataType = T;
static constexpr bool indexable = false;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MUL>
{
using opType = reduce::Mul<T>;
using dataType = T;
static constexpr bool indexable = false;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MIN>
{
using opType = reduce::Min<T>;
using dataType = T;
static constexpr bool indexable = true;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MAX>
{
using opType = reduce::Max<T>;
using dataType = T;
static constexpr bool indexable = true;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::AMAX>
{
using opType = reduce::AMax<T>;
using dataType = T;
static constexpr bool indexable = true;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::AVG>
{
using opType = reduce::Add<T>;
using dataType = T;
static constexpr bool indexable = false;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::NORM1>
{
using opType = reduce::Add<T>;
using dataType = T;
static constexpr bool indexable = false;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::NORM2>
{
using opType = reduce::Add<T>;
using dataType = T;
static constexpr bool indexable = false;
};
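// A minimal compile-time usage sketch (illustrative only; float is just an
// example data type): upper-layer code can query "indexable" to decide whether
// an index output buffer is meaningful for the chosen reduction operation.
static_assert(reduce_binary_operator<float, ReduceTensorOp_t::MAX>::indexable,
              "MIN/MAX/AMAX reductions can also return the winning index");
static_assert(!reduce_binary_operator<float, ReduceTensorOp_t::ADD>::indexable,
              "ADD-based reductions (ADD/AVG/NORM1/NORM2) produce no index");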
// The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary
// functor classes.
// The two unary functors are applied before and after the Reduction is executed, respectively
// (see the NORM2 sketch after the specializations below).
template <typename T, ReduceTensorOp_t Op, bool IsFirstReduce, bool IsLastReduce>
struct reduce_unary_operator
{
using InElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
};
template <typename T, bool IsFirstReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::AVG, IsFirstReduce, true>
{
using InElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T, true>;
};
template <typename T, bool IsLastReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM1, true, IsLastReduce>
{
using InElementwiseOperation = tensor_operation::element_wise::UnaryAbs<T, T>;
using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
};
template <typename T, bool IsLastReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::AMAX, true, IsLastReduce>
{
using InElementwiseOperation = tensor_operation::element_wise::UnaryAbs<T, T>;
using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
};
template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, false>
{
using InElementwiseOperation = tensor_operation::element_wise::UnarySquare<T, T>;
using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
};
template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, true>
{
using InElementwiseOperation = tensor_operation::element_wise::UnarySquare<T, T>;
using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt<T, T>;
};
template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, false, true>
{
using InElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt<T, T>;
};
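// Illustrative sketch (comments only, assuming float as the data type): for a
// one-pass NORM2 reduction (IsFirstReduce && IsLastReduce), the mapping yields
//   InElementwiseOperation  = tensor_operation::element_wise::UnarySquare<float, float>
//   AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt<float, float>
// so the kernel effectively computes sqrt(sum(x * x)) over the reduced dimensions.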
} // end of namespace ck
#endif
@@ -87,14 +87,17 @@ struct NKHW : public BaseTensorLayout
struct NDHWC : public BaseTensorLayout
{
static constexpr const char* name = "NDHWC";
};
struct KZYXC : public BaseTensorLayout
{
static constexpr const char* name = "KZYXC";
};
struct NDHWK : public BaseTensorLayout
{
static constexpr const char* name = "NDHWK";
};
} // namespace convolution
#ifndef CK_ELEMENT_WISE_OPERATION_HPP
#define CK_ELEMENT_WISE_OPERATION_HPP
#include "data_type.hpp"
#include "data_type.hpp"
@@ -13,7 +14,7 @@ struct PassThrough
__host__ __device__ void operator()(half_t& y, const half_t& x) const { y = x; }
__host__ __device__ void operator()(ushort& y, const ushort& x) const { y = x; }
__host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const { y = x; }
__host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x; }
@@ -143,6 +144,192 @@ struct AddHardswishAdd
}
};
struct RequantReluRequant
{
// FIXME: We just need one scale for Relu / Leaky Relu / PRelu
RequantReluRequant(float scaleGemm, float scaleRelu)
: scaleGemm_(scaleGemm), scaleRelu_(scaleRelu)
{
}
__host__ __device__ constexpr void operator()(int8_t& y, const int& x) const
{
float gemm_requant = scaleGemm_ * static_cast<float>(x);
float relu = gemm_requant > 0 ? gemm_requant : 0;
float relu_requant = scaleRelu_ * relu;
y = static_cast<int8_t>(relu_requant > 127 ? 127
: relu_requant < -128 ? -128 : relu_requant);
}
// for reference_gemm
__host__ __device__ constexpr void operator()(float& y, const float& x) const
{
float gemm_requant = scaleGemm_ * x;
float relu = gemm_requant > 0 ? gemm_requant : 0;
float relu_requant = scaleRelu_ * relu;
y = static_cast<float>(relu_requant > 127 ? 127
: relu_requant < -128 ? -128 : relu_requant);
}
float scaleGemm_;
float scaleRelu_;
};
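// Usage sketch (hypothetical scale values):
//   RequantReluRequant op(0.5f, 2.0f);
//   int8_t y;
//   op(y, 300); // 0.5 * 300 = 150 -> ReLU keeps 150 -> 2 * 150 = 300
//               // -> clamped to the int8 range, so y = 127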
// Unary operators are usually applied element-wise before/after the reduction is executed on the
// elements. They are needed for easy implementation of the AVG, NORM1 and NORM2 reduction types
// (a worked AVG sketch follows the dividing UnaryIdentic specialization below).
template <typename Y, typename X, bool HasDividing = false>
struct UnaryIdentic;
template <>
struct UnaryIdentic<float, float, false>
{
__host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(float& y, const float& x) const { y = x; };
};
template <>
struct UnaryIdentic<float, float, true>
{
__host__ __device__ UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
__host__ __device__ void operator()(float& y, const float& x) const
{
y = x / type_convert<float>(divider_);
};
int32_t divider_ = 1;
};
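// Worked sketch (comments only, hypothetical values): AVG over N = 4 elements
// {1, 2, 3, 6} accumulates 12 via reduce::Add, then the dividing functor above,
// constructed as UnaryIdentic<float, float, true>{4}, is applied as the
// AccElementwiseOperation and yields y = 12 / 4 = 3.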
template <>
struct UnaryIdentic<half_t, half_t, false>
{
__host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(half_t& y, const half_t& x) const { y = x; };
};
template <>
struct UnaryIdentic<double, double, false>
{
__host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(double& y, const double& x) const { y = x; };
};
template <>
struct UnaryIdentic<double, double, true>
{
__host__ __device__ UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
__host__ __device__ void operator()(double& y, const double& x) const
{
y = x / type_convert<double>(divider_);
};
int32_t divider_ = 1;
};
template <>
struct UnaryIdentic<int32_t, int32_t, false>
{
__host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x; };
};
template <typename Y, typename X, bool HasDividing = false>
struct UnarySquare;
template <>
struct UnarySquare<float, float, false>
{
__host__ __device__ UnarySquare(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(float& y, const float& x) const { y = x * x; };
};
template <>
struct UnarySquare<float, float, true>
{
__host__ __device__ UnarySquare(const int32_t divider = 1) { divider_ = divider; };
__host__ __device__ void operator()(float& y, const float& x) const
{
y = x * x / type_convert<float>(divider_);
};
int32_t divider_ = 1;
};
template <>
struct UnarySquare<double, double, false>
{
__host__ __device__ UnarySquare(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(double& y, const double& x) const { y = x * x; };
};
template <>
struct UnarySquare<double, double, true>
{
__host__ __device__ UnarySquare(const int32_t divider = 1) { divider_ = divider; };
__host__ __device__ void operator()(double& y, const double& x) const
{
y = x * x / type_convert<double>(divider_);
};
int32_t divider_ = 1;
};
template <typename Y, typename X>
struct UnaryAbs;
template <>
struct UnaryAbs<float, float>
{
__host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(float& y, const float& x) const { y = abs(x); };
};
template <>
struct UnaryAbs<half_t, half_t>
{
__host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(half_t& y, const half_t& x) const { y = __habs(x); };
};
template <>
struct UnaryAbs<double, double>
{
__host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(double& y, const double& x) const { y = abs(x); };
};
template <typename Y, typename X>
struct UnarySqrt;
template <>
struct UnarySqrt<float, float>
{
__host__ __device__ UnarySqrt(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(float& y, const float& x) const { y = sqrtf(x); };
};
template <>
struct UnarySqrt<double, double>
{
__host__ __device__ UnarySqrt(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(double& y, const double& x) const { y = sqrt(x); };
};
} // namespace element_wise
} // namespace tensor_operation
} // namespace ck
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_2D_REDUCTION_BLOCKWISE_HPP
#define CK_GRIDWISE_2D_REDUCTION_BLOCKWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_accumulate.hpp"
#include "reduction_functions_blockwise.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
namespace ck {
template <typename GridwiseReduction,
bool NeedIndices,
typename InDataType,
typename OutDataType,
typename AccDataType,
typename IndexDataType,
typename InGridDesc_M_K,
typename OutGridDesc_M,
typename InElementwiseOperation,
typename OutElementwiseOperation>
__global__ void kernel_reduce_blockwise(const InGridDesc_M_K in_grid_desc_m_k,
const OutGridDesc_M out_grid_desc_m,
const InElementwiseOperation in_elementwise_op,
const OutElementwiseOperation acc_elementwise_op,
AccDataType alpha,
const InDataType* const __restrict__ p_in_global,
OutDataType beta,
OutDataType* const __restrict__ p_out_global,
const IndexDataType* const __restrict__ p_ws_indices_global,
IndexDataType* const __restrict__ p_indices_global)
{
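// Compile-time dispatch: only the branch matching NeedIndices is instantiated,
// so the value-only kernel carries none of the index-tracking code.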
if constexpr(!NeedIndices)
{
GridwiseReduction::Run(in_grid_desc_m_k,
out_grid_desc_m,
in_elementwise_op,
acc_elementwise_op,
alpha,
p_in_global,
beta,
p_out_global,
p_ws_indices_global,
p_indices_global);
}
else
{
GridwiseReduction::RunWithIndex(in_grid_desc_m_k,
out_grid_desc_m,
in_elementwise_op,
acc_elementwise_op,
alpha,
p_in_global,
beta,
p_out_global,
p_ws_indices_global,
p_indices_global);
};
};
template <typename GridwiseReduction,
bool NeedIndices,
typename InDataType,
typename OutDataType,
typename AccDataType,
typename IndexDataType,
typename InGridDesc_M_K,
typename OutGridDesc_M,
typename InElementwiseOperation,
typename OutElementwiseOperation>
__global__ void
kernel_reduce_blockwise_second_call(const InGridDesc_M_K in_grid_desc_m_k,
const OutGridDesc_M out_grid_desc_m,
const InElementwiseOperation in_elementwise_op,
const OutElementwiseOperation acc_elementwise_op,
AccDataType alpha,
const InDataType* const __restrict__ p_in_global,
OutDataType beta,
OutDataType* const __restrict__ p_out_global,
const IndexDataType* const __restrict__ p_ws_indices_global,
IndexDataType* const __restrict__ p_indices_global)
{
if constexpr(!NeedIndices)
{
GridwiseReduction::Run(in_grid_desc_m_k,
out_grid_desc_m,
in_elementwise_op,
acc_elementwise_op,
alpha,
p_in_global,
beta,
p_out_global,
p_ws_indices_global,
p_indices_global);
}
else
{
GridwiseReduction::RunSecondCallWithIndex(in_grid_desc_m_k,
out_grid_desc_m,
in_elementwise_op,
acc_elementwise_op,
alpha,
p_in_global,
beta,
p_out_global,
p_ws_indices_global,
p_indices_global);
};
};
template <typename InDataType,
typename OutDataType,
typename AccDataType,
typename IndexDataType,
typename InGridDesc_M_K,
typename OutGridDesc_M,
typename ReduceOperation,
typename InElementwiseOperation,
typename OutElementwiseOperation,
bool PropagateNan,
bool BetaIsZero,
index_t BlockSize,
index_t MThreadClusterSize,
index_t KThreadClusterSize,
index_t MThreadSliceSize,
index_t KThreadSliceSize,
index_t InSrcVectorDim,
index_t InSrcVectorSize,
index_t OutDstVectorSize>
struct GridwiseReduction_mk_to_m_blockwise
{
static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0);
static constexpr auto buffer_1d_desc =
make_naive_tensor_descriptor_packed(make_tuple(Number<BlockSize>{}));
template <typename T>
using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<T, T>;
static constexpr auto I0 = Number<0>{};
static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
__device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k,
const OutGridDesc_M& out_grid_desc_m,
const InElementwiseOperation& in_elementwise_op,
const OutElementwiseOperation& acc_elementwise_op,
AccDataType alpha,
const InDataType* const __restrict__ p_in_global,
OutDataType beta,
OutDataType* const __restrict__ p_out_global,
const IndexDataType* const __restrict__ p_ws_indices_global,
IndexDataType* const __restrict__ p_indices_global)
{
using BlockwiseReduce = PartitionedBlockwiseReductionOn1dBuffer<decltype(buffer_1d_desc),
AccDataType,
BlockSize,
MThreadClusterSize,
KThreadClusterSize,
reorder_thread_cluster,
ReduceOperation,
PropagateNan>;
using Accumulation =
detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;
(void)p_ws_indices_global;
(void)p_indices_global;
// LDS
__shared__ AccDataType p_block_reduce_buffer[BlockSize];
const auto zeroVal = ReduceOperation::GetReductionZeroVal();
const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert<InDataType>(zeroVal));
auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_out_global, out_grid_desc_m.GetElementSpaceSize());
auto block_reduce_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_block_reduce_buffer, BlockSize);
StaticBuffer<AddressSpaceEnum_t::Vgpr,
AccDataType,
MThreadSliceSize * KThreadSliceSize,
true>
in_thread_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;
static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; });
const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{});
const index_t thread_local_id = get_thread_local_1d_id();
const index_t block_global_1d_id = get_block_1d_id();
const index_t thread_m_cluster_id =
reorder_thread_cluster ? thread_local_id % MThreadClusterSize
: ((thread_local_id / KThreadClusterSize) % MThreadClusterSize);
const index_t thread_k_cluster_id =
reorder_thread_cluster ? ((thread_local_id / MThreadClusterSize) % KThreadClusterSize)
: thread_local_id % KThreadClusterSize;
using ThreadBufferLengths = Sequence<MThreadSliceSize, KThreadSliceSize>;
constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));
auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<
InDataType,
AccDataType,
InGridDesc_M_K,
decltype(thread_buffer_desc),
ThreadBufferLengths,
typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
InSrcVectorDim,
InSrcVectorSize,
1,
false>(in_grid_desc_m_k,
make_multi_index(block_global_1d_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize,
thread_k_cluster_id * KThreadSliceSize));
constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize);
const index_t toReduceTiles = (toReduceLength + K_BlockTileSize - 1) / K_BlockTileSize;
index_t reducedTiles = 0;
do
{
threadwise_src_load.Run(in_grid_desc_m_k,
in_global_buf,
thread_buffer_desc,
make_tuple(I0, I0),
in_thread_buf);
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
// do element-wise pre-reduction operation
static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
in_elementwise_op(in_thread_buf(offset), in_thread_buf(offset));
});
// reduce on each thread-local slice
static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
Accumulation::Calculate(accu_value_buf(I), in_thread_buf[offset]);
});
});
threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);
reducedTiles++;
} while(reducedTiles < toReduceTiles);
constexpr auto reduced_data_desc =
make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));
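// Stage each thread's partial result in LDS, then reduce cooperatively across
// the K thread-cluster dimension; the LDS index ordering follows the
// thread-cluster ordering selected by reorder_thread_cluster.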
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
if constexpr(reorder_thread_cluster)
{
block_reduce_buf(thread_k_cluster_id * MThreadClusterSize + thread_m_cluster_id) =
accu_value_buf[I];
}
else
block_reduce_buf(thread_m_cluster_id * KThreadClusterSize + thread_k_cluster_id) =
accu_value_buf[I];
accu_value_buf(I) = zeroVal;
__syncthreads();
BlockwiseReduce::Reduce(
block_reduce_buf, accu_value_buf(I), thread_m_cluster_id, thread_k_cluster_id);
});
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
if(thread_k_cluster_id == 0)
{
acc_elementwise_op(accu_value_buf(I), accu_value_buf(I));
accu_value_buf(I) *= alpha;
}
});
if(thread_k_cluster_id == 0)
{
if constexpr(!BetaIsZero)
{
if(!float_equal_zero{}(beta))
{
StaticBuffer<AddressSpaceEnum_t::Vgpr, OutDataType, MThreadSliceSize, true>
priorDstValueBuf;
auto threadwise_dst_load =
ThreadwiseTensorSliceTransfer_v2<OutDataType,
OutDataType,
OutGridDesc_M,
decltype(reduced_data_desc),
Sequence<MThreadSliceSize>,
Sequence<0>,
0,
OutDstVectorSize,
1,
false>(
out_grid_desc_m,
make_multi_index(block_global_1d_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize));
threadwise_dst_load.Run(out_grid_desc_m,
out_global_buf,
reduced_data_desc,
make_tuple(I0),
priorDstValueBuf);
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
accu_value_buf(I) += type_convert<AccDataType>(priorDstValueBuf[I] * beta);
});
};
};
auto threadwise_dst_store =
ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
OutDataType,
decltype(reduced_data_desc),
OutGridDesc_M,
PassThroughOp<AccDataType>,
Sequence<MThreadSliceSize>,
Sequence<0>,
0,
OutDstVectorSize,
InMemoryDataOperationEnum_t::Set,
1,
true>(
out_grid_desc_m,
make_multi_index(block_global_1d_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize),
PassThroughOp<AccDataType>{});
threadwise_dst_store.Run(
reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, out_global_buf);
}
};
__device__ static void RunWithIndex(const InGridDesc_M_K& in_grid_desc_m_k,
const OutGridDesc_M& out_grid_desc_m,
const InElementwiseOperation& in_elementwise_op,
const OutElementwiseOperation& acc_elementwise_op,
AccDataType alpha,
const InDataType* const __restrict__ p_in_global,
OutDataType beta,
OutDataType* const __restrict__ p_out_global,
const IndexDataType* const __restrict__ p_ws_indices_global,
IndexDataType* const __restrict__ p_indices_global)
{
using BlockwiseReduceWithIndex =
PartitionedBlockwiseReductionWithIndexOn1dBuffer<decltype(buffer_1d_desc),
AccDataType,
IndexDataType,
BlockSize,
MThreadClusterSize,
KThreadClusterSize,
reorder_thread_cluster,
ReduceOperation,
PropagateNan>;
using AccumulationWithIndex = detail::AccumulateWithIndexAndNanCheck<PropagateNan,
ReduceOperation,
AccDataType,
IndexDataType>;
(void)p_ws_indices_global;
// LDS
__shared__ AccDataType p_block_reduce_val_buffer[BlockSize];
__shared__ IndexDataType p_block_reduce_idx_buffer[BlockSize];
const auto zeroVal = ReduceOperation::GetReductionZeroVal();
const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert<InDataType>(zeroVal));
auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_out_global, out_grid_desc_m.GetElementSpaceSize());
auto out_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_indices_global, out_grid_desc_m.GetElementSpaceSize());
auto block_reduce_val_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_block_reduce_val_buffer, BlockSize);
auto block_reduce_idx_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_block_reduce_idx_buffer, BlockSize);
StaticBuffer<AddressSpaceEnum_t::Vgpr,
AccDataType,
MThreadSliceSize * KThreadSliceSize,
true>
in_thread_val_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, index_t, MThreadSliceSize * KThreadSliceSize, true>
in_thread_idx_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, IndexDataType, MThreadSliceSize, true>
accu_index_buf;
const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{});
const index_t thread_local_id = get_thread_local_1d_id();
const index_t block_global_1d_id = get_block_1d_id();
const index_t thread_m_cluster_id =
reorder_thread_cluster ? thread_local_id % MThreadClusterSize
: ((thread_local_id / KThreadClusterSize) % MThreadClusterSize);
const index_t thread_k_cluster_id =
reorder_thread_cluster ? ((thread_local_id / MThreadClusterSize) % KThreadClusterSize)
: thread_local_id % KThreadClusterSize;
using ThreadBufferLengths = Sequence<MThreadSliceSize, KThreadSliceSize>;
constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));
auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<
InDataType,
AccDataType,
InGridDesc_M_K,
decltype(thread_buffer_desc),
ThreadBufferLengths,
typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
InSrcVectorDim,
InSrcVectorSize,
1,
false>(in_grid_desc_m_k,
make_multi_index(block_global_1d_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize,
thread_k_cluster_id * KThreadSliceSize));
index_t indexOffset = 0;
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
accu_value_buf(I) = zeroVal;
accu_index_buf(I) = 0;
});
constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize);
const index_t toReduceTiles = (toReduceLength + K_BlockTileSize - 1) / K_BlockTileSize;
index_t reducedTiles = 0;
do
{
// load the thread slice
threadwise_src_load.Run(in_grid_desc_m_k,
in_global_buf,
thread_buffer_desc,
make_tuple(I0, I0),
in_thread_val_buf);
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
// initialize the indices for the per-thread to-reduce values
in_thread_idx_buf(offset) =
indexOffset + thread_k_cluster_id * KThreadSliceSize + J();
// do element-wise pre-reduction operation
in_elementwise_op(in_thread_val_buf(offset), in_thread_val_buf(offset));
});
AccDataType tmpValue = zeroVal;
IndexDataType tmpIndex = 0;
static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
// reduce on the dim1 thread slice
AccumulationWithIndex::Calculate(
tmpValue, in_thread_val_buf[offset], tmpIndex, in_thread_idx_buf[offset]);
});
// store thread local value to LDS for parallel reduction
if constexpr(reorder_thread_cluster)
{
block_reduce_val_buf(thread_k_cluster_id * MThreadClusterSize +
thread_m_cluster_id) = tmpValue;
block_reduce_idx_buf(thread_k_cluster_id * MThreadClusterSize +
thread_m_cluster_id) = tmpIndex;
}
else
{
block_reduce_val_buf(thread_m_cluster_id * KThreadClusterSize +
thread_k_cluster_id) = tmpValue;
block_reduce_idx_buf(thread_m_cluster_id * KThreadClusterSize +
thread_k_cluster_id) = tmpIndex;
}
__syncthreads();
BlockwiseReduceWithIndex::Reduce(block_reduce_val_buf,
block_reduce_idx_buf,
tmpValue,
tmpIndex,
thread_m_cluster_id,
thread_k_cluster_id);
AccumulationWithIndex::Calculate(
accu_value_buf(I), tmpValue, accu_index_buf(I), tmpIndex);
});
threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);
indexOffset += K_BlockTileSize;
reducedTiles++;
} while(reducedTiles < toReduceTiles);
constexpr auto reduced_data_desc =
make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
if(thread_k_cluster_id == 0)
{
// for an indexed operation, acc_elementwise_op should do nothing
acc_elementwise_op(accu_value_buf(I), accu_value_buf(I));
accu_value_buf(I) *= alpha;
}
});
if(thread_k_cluster_id == 0)
{
if constexpr(!BetaIsZero)
{
if(!float_equal_zero{}(beta))
{
StaticBuffer<AddressSpaceEnum_t::Vgpr, OutDataType, MThreadSliceSize, true>
priorDstValueBuf;
auto threadwise_dst_load =
ThreadwiseTensorSliceTransfer_v2<OutDataType,
OutDataType,
OutGridDesc_M,
decltype(reduced_data_desc),
Sequence<MThreadSliceSize>,
Sequence<0>,
0,
OutDstVectorSize,
1,
false>(
out_grid_desc_m,
make_multi_index(block_global_1d_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize));
threadwise_dst_load.Run(out_grid_desc_m,
out_global_val_buf,
reduced_data_desc,
make_tuple(I0),
priorDstValueBuf);
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
accu_value_buf(I) += type_convert<AccDataType>(priorDstValueBuf[I] * beta);
});
};
};
auto threadwise_dst_val_store =
ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
OutDataType,
decltype(reduced_data_desc),
OutGridDesc_M,
PassThroughOp<AccDataType>,
Sequence<MThreadSliceSize>,
Sequence<0>,
0,
OutDstVectorSize,
InMemoryDataOperationEnum_t::Set,
1,
false>(
out_grid_desc_m,
make_multi_index(block_global_1d_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize),
PassThroughOp<AccDataType>{});
auto threadwise_dst_idx_store =
ThreadwiseTensorSliceTransfer_v1r3<IndexDataType,
IndexDataType,
decltype(reduced_data_desc),
OutGridDesc_M,
PassThroughOp<index_t>,
Sequence<MThreadSliceSize>,
Sequence<0>,
0,
OutDstVectorSize,
InMemoryDataOperationEnum_t::Set,
1,
false>(
out_grid_desc_m,
make_multi_index(block_global_1d_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize),
PassThroughOp<index_t>{});
threadwise_dst_val_store.Run(reduced_data_desc,
make_tuple(I0),
accu_value_buf,
out_grid_desc_m,
out_global_val_buf);
threadwise_dst_idx_store.Run(reduced_data_desc,
make_tuple(I0),
accu_index_buf,
out_grid_desc_m,
out_global_idx_buf);
}
};
__device__ static void
RunSecondCallWithIndex(const InGridDesc_M_K& in_grid_desc_m_k,
const OutGridDesc_M& out_grid_desc_m,
const InElementwiseOperation in_elementwise_op,
const OutElementwiseOperation acc_elementwise_op,
AccDataType alpha,
const InDataType* const __restrict__ p_ws_values_global,
OutDataType beta,
OutDataType* const __restrict__ p_out_global,
const IndexDataType* const __restrict__ p_ws_indices_global,
IndexDataType* const __restrict__ p_indices_global)
{
using BlockwiseReduceWithIndex =
PartitionedBlockwiseReductionWithIndexOn1dBuffer<decltype(buffer_1d_desc),
AccDataType,
IndexDataType,
BlockSize,
MThreadClusterSize,
KThreadClusterSize,
reorder_thread_cluster,
ReduceOperation,
PropagateNan>;
using AccumulationWithIndex = detail::AccumulateWithIndexAndNanCheck<PropagateNan,
ReduceOperation,
AccDataType,
IndexDataType>;
(void)in_elementwise_op;
// LDS
__shared__ AccDataType p_block_reduce_val_buffer[BlockSize];
__shared__ IndexDataType p_block_reduce_idx_buffer[BlockSize];
const auto zeroVal = ReduceOperation::GetReductionZeroVal();
const auto src_global_val_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Global>(p_ws_values_global,
in_grid_desc_m_k.GetElementSpaceSize(),
type_convert<InDataType>(zeroVal));
const auto src_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_ws_indices_global, in_grid_desc_m_k.GetElementSpaceSize());
auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_out_global, out_grid_desc_m.GetElementSpaceSize());
auto out_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_indices_global, out_grid_desc_m.GetElementSpaceSize());
auto block_reduce_val_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_block_reduce_val_buffer, BlockSize);
auto block_reduce_idx_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_block_reduce_idx_buffer, BlockSize);
StaticBuffer<AddressSpaceEnum_t::Vgpr,
AccDataType,
MThreadSliceSize * KThreadSliceSize,
true>
in_thread_val_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr,
IndexDataType,
MThreadSliceSize * KThreadSliceSize,
true>
in_thread_idx_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, IndexDataType, MThreadSliceSize, true>
accu_index_buf;
const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{});
const index_t thread_local_id = get_thread_local_1d_id();
const index_t block_global_1d_id = get_block_1d_id();
const index_t thread_m_cluster_id =
reorder_thread_cluster ? thread_local_id % MThreadClusterSize
: ((thread_local_id / KThreadClusterSize) % MThreadClusterSize);
const index_t thread_k_cluster_id =
reorder_thread_cluster ? ((thread_local_id / MThreadClusterSize) % KThreadClusterSize)
: thread_local_id % KThreadClusterSize;
using ThreadBufferLengths = Sequence<MThreadSliceSize, KThreadSliceSize>;
constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));
auto threadwise_src_val_load = ThreadwiseTensorSliceTransfer_v2<
InDataType,
AccDataType,
InGridDesc_M_K,
decltype(thread_buffer_desc),
ThreadBufferLengths,
typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
InSrcVectorDim,
InSrcVectorSize,
1,
false>(in_grid_desc_m_k,
make_multi_index(block_global_1d_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize,
thread_k_cluster_id * KThreadSliceSize));
auto threadwise_src_idx_load = ThreadwiseTensorSliceTransfer_v2<
IndexDataType,
IndexDataType,
InGridDesc_M_K,
decltype(thread_buffer_desc),
ThreadBufferLengths,
typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
InSrcVectorDim,
InSrcVectorSize,
1,
false>(in_grid_desc_m_k,
make_multi_index(block_global_1d_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize,
thread_k_cluster_id * KThreadSliceSize));
// index_t indexOffset = 0;
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
accu_value_buf(I) = zeroVal;
accu_index_buf(I) = 0;
});
constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize);
const index_t toReduceTiles = (toReduceLength + K_BlockTileSize - 1) / K_BlockTileSize;
index_t reducedTiles = 0;
do
{
// load the thread slice
threadwise_src_val_load.Run(in_grid_desc_m_k,
src_global_val_buf,
thread_buffer_desc,
make_tuple(I0, I0),
in_thread_val_buf);
threadwise_src_idx_load.Run(in_grid_desc_m_k,
src_global_idx_buf,
thread_buffer_desc,
make_tuple(I0, I0),
in_thread_idx_buf);
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
AccDataType tmpValue = zeroVal;
IndexDataType tmpIndex = 0;
static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
// reduce on the dim1 thread slice
AccumulationWithIndex::Calculate(
tmpValue, in_thread_val_buf[offset], tmpIndex, in_thread_idx_buf[offset]);
});
// store thread local value to LDS for parallel reduction
if constexpr(reorder_thread_cluster)
{
block_reduce_val_buf(thread_k_cluster_id * MThreadClusterSize +
thread_m_cluster_id) = tmpValue;
block_reduce_idx_buf(thread_k_cluster_id * MThreadClusterSize +
thread_m_cluster_id) = tmpIndex;
}
else
{
block_reduce_val_buf(thread_m_cluster_id * KThreadClusterSize +
thread_k_cluster_id) = tmpValue;
block_reduce_idx_buf(thread_m_cluster_id * KThreadClusterSize +
thread_k_cluster_id) = tmpIndex;
}
__syncthreads();
BlockwiseReduceWithIndex::Reduce(block_reduce_val_buf,
block_reduce_idx_buf,
tmpValue,
tmpIndex,
thread_m_cluster_id,
thread_k_cluster_id);
AccumulationWithIndex::Calculate(
accu_value_buf(I), tmpValue, accu_index_buf(I), tmpIndex);
});
threadwise_src_val_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);
threadwise_src_idx_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);
// indexOffset += K_BlockTileSize;
reducedTiles++;
} while(reducedTiles < toReduceTiles);
constexpr auto reduced_data_desc =
make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
if(thread_k_cluster_id == 0)
{
// for an indexed operation, acc_elementwise_op should do nothing
acc_elementwise_op(accu_value_buf(I), accu_value_buf(I));
accu_value_buf(I) *= alpha;
}
});
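// Only threads with thread_k_cluster_id == 0 hold final results; they optionally blend in
// beta * prior output (unless BetaIsZero) and then write the reduced values and indices to global memory.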
if(thread_k_cluster_id == 0)
{
if constexpr(!BetaIsZero)
{
if(!float_equal_zero{}(beta))
{
StaticBuffer<AddressSpaceEnum_t::Vgpr, OutDataType, MThreadSliceSize, true>
priorDstValueBuf;
auto threadwise_dst_load =
ThreadwiseTensorSliceTransfer_v2<OutDataType,
OutDataType,
OutGridDesc_M,
decltype(reduced_data_desc),
Sequence<MThreadSliceSize>,
Sequence<0>,
0,
OutDstVectorSize,
1,
true>(
out_grid_desc_m,
make_multi_index(block_global_1d_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize));
threadwise_dst_load.Run(out_grid_desc_m,
out_global_val_buf,
reduced_data_desc,
make_tuple(I0),
priorDstValueBuf);
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
accu_value_buf(I) += type_convert<AccDataType>(priorDstValueBuf[I] * beta);
});
};
};
auto threadwise_dst_val_store =
ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
OutDataType,
decltype(reduced_data_desc),
OutGridDesc_M,
PassThroughOp<AccDataType>,
Sequence<MThreadSliceSize>,
Sequence<0>,
0,
OutDstVectorSize,
InMemoryDataOperationEnum_t::Set,
1,
true>(
out_grid_desc_m,
make_multi_index(block_global_1d_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize),
PassThroughOp<AccDataType>{});
auto threadwise_dst_idx_store =
ThreadwiseTensorSliceTransfer_v1r3<IndexDataType,
IndexDataType,
decltype(reduced_data_desc),
OutGridDesc_M,
PassThroughOp<IndexDataType>,
Sequence<MThreadSliceSize>,
Sequence<0>,
0,
OutDstVectorSize,
InMemoryDataOperationEnum_t::Set,
1,
true>(
out_grid_desc_m,
make_multi_index(block_global_1d_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize),
PassThroughOp<IndexDataType>{});
threadwise_dst_val_store.Run(reduced_data_desc,
make_tuple(I0),
accu_value_buf,
out_grid_desc_m,
out_global_val_buf);
threadwise_dst_idx_store.Run(reduced_data_desc,
make_tuple(I0),
accu_index_buf,
out_grid_desc_m,
out_global_idx_buf);
}
};
};
} // namespace ck
#endif
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_ATOMIC_ADD_HPP
#define CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_ATOMIC_ADD_HPP
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_accumulate.hpp"
#include "reduction_functions_blockwise.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
namespace ck {
template <typename GridwiseReduction,
typename InDataType,
typename OutDataType,
typename AccDataType,
typename InGridDesc_M_K,
typename OutGridDesc_M,
typename InElementwiseOperation,
typename AccElementwiseOperation>
__global__ void
kernel_reduce_multiblock_atocmi_add(const InGridDesc_M_K in_grid_desc_m_k,
const OutGridDesc_M out_grid_desc_m,
const InElementwiseOperation in_elementwise_op,
const AccElementwiseOperation acc_elementwise_op,
index_t block_group_size,
index_t num_k_block_tile_iteration,
AccDataType alpha,
const InDataType* const __restrict__ p_in_global,
OutDataType* const __restrict__ p_out_global)
{
GridwiseReduction::Run(in_grid_desc_m_k,
out_grid_desc_m,
in_elementwise_op,
acc_elementwise_op,
block_group_size,
num_k_block_tile_iteration,
alpha,
p_in_global,
p_out_global);
};
template <typename InDataType,
typename OutDataType,
typename AccDataType,
typename InGridDesc_M_K,
typename OutGridDesc_M,
typename ReduceOperation,
typename InElementwiseOperation,
typename AccElementwiseOperation,
bool PropagateNan,
index_t BlockSize,
index_t MThreadClusterSize,
index_t KThreadClusterSize,
index_t MThreadSliceSize,
index_t KThreadSliceSize,
index_t InSrcVectorDim,
index_t InSrcVectorSize,
index_t OutDstVectorSize>
struct GridwiseReduction_mk_to_m_multiblock_atomic_add
{
static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0);
static constexpr auto buffer_1d_desc =
make_naive_tensor_descriptor_packed(make_tuple(Number<BlockSize>{}));
using blockwise_reduce = PartitionedBlockwiseReductionOn1dBuffer<decltype(buffer_1d_desc),
AccDataType,
BlockSize,
MThreadClusterSize,
KThreadClusterSize,
reorder_thread_cluster,
ReduceOperation,
PropagateNan>;
template <typename T>
using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<T, T>;
static constexpr auto I0 = Number<0>{};
static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
using Accumulation = detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;
__device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k,
const OutGridDesc_M& out_grid_desc_m,
const InElementwiseOperation& in_elementwise_op,
const AccElementwiseOperation& acc_elementwise_op,
index_t block_group_size,
index_t num_k_block_tile_iteration,
AccDataType alpha,
const InDataType* const __restrict__ p_in_global,
OutDataType* const __restrict__ p_out_global)
{
const auto zeroVal = ReduceOperation::GetReductionZeroVal();
// LDS
__shared__ AccDataType p_block_reduce_buffer[BlockSize];
const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert<InDataType>(zeroVal));
auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_out_global, out_grid_desc_m.GetElementSpaceSize());
auto block_reduce_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_block_reduce_buffer, BlockSize);
StaticBuffer<AddressSpaceEnum_t::Vgpr,
AccDataType,
MThreadSliceSize * KThreadSliceSize,
true>
in_thread_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;
static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; });
const index_t thread_local_id = get_thread_local_1d_id();
const index_t block_global_id = get_block_1d_id();
const index_t blkgroup_id = block_global_id / block_group_size;
const index_t block_local_id = block_global_id % block_group_size;
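// Blocks are organized in groups of block_group_size: blkgroup_id selects the M tile, while
// block_local_id selects this block's contiguous slice of the K dimension.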
const index_t thread_m_cluster_id =
reorder_thread_cluster ? thread_local_id % MThreadClusterSize
: ((thread_local_id / KThreadClusterSize) % MThreadClusterSize);
const index_t thread_k_cluster_id =
reorder_thread_cluster ? ((thread_local_id / MThreadClusterSize) % KThreadClusterSize)
: thread_local_id % KThreadClusterSize;
const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration;
using ThreadBufferLengths = Sequence<MThreadSliceSize, KThreadSliceSize>;
constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));
auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<
InDataType,
AccDataType,
InGridDesc_M_K,
decltype(thread_buffer_desc),
ThreadBufferLengths,
typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
InSrcVectorDim,
InSrcVectorSize,
1,
false>(
in_grid_desc_m_k,
make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize,
block_local_id * reduceSizePerBlock +
thread_k_cluster_id * KThreadSliceSize));
constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize);
index_t reducedTiles = 0;
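// Walk num_k_block_tile_iteration tiles along K, applying the elementwise op to each loaded
// element and accumulating into accu_value_buf.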
do
{
threadwise_src_load.Run(in_grid_desc_m_k,
in_global_buf,
thread_buffer_desc,
make_tuple(I0, I0),
in_thread_buf);
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
// do element-wise pre-reduction operation
static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
in_elementwise_op(in_thread_buf(offset), in_thread_buf(offset));
});
// reduce on each thread-local slice
static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
Accumulation::Calculate(accu_value_buf(I), in_thread_buf[offset]);
});
});
threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);
reducedTiles++;
} while(reducedTiles < num_k_block_tile_iteration);
constexpr auto reduced_data_desc =
make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));
// Each block performs multiple parallel reductions in LDS and then atomic-adds its reduced
// output to the global location of each invariant dimension, so the partial results of all
// blocks in a group accumulate into a consistent value for that dimension. Because vector
// loads are used, each block/thread covers multiple invariant dimensions.
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
if constexpr(reorder_thread_cluster)
{
block_reduce_buf(thread_k_cluster_id * MThreadClusterSize + thread_m_cluster_id) =
accu_value_buf[I];
}
else
block_reduce_buf(thread_m_cluster_id * KThreadClusterSize + thread_k_cluster_id) =
accu_value_buf[I];
accu_value_buf(I) = zeroVal;
__syncthreads();
blockwise_reduce::Reduce(
block_reduce_buf, accu_value_buf(I), thread_m_cluster_id, thread_k_cluster_id);
});
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
if(thread_k_cluster_id == 0)
{
acc_elementwise_op(accu_value_buf(I), accu_value_buf(I));
accu_value_buf(I) *= alpha;
}
});
if(thread_k_cluster_id == 0)
{
auto threadwise_dst_store =
ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
OutDataType,
decltype(reduced_data_desc),
OutGridDesc_M,
PassThroughOp<AccDataType>,
Sequence<MThreadSliceSize>,
Sequence<0>,
0,
OutDstVectorSize,
InMemoryDataOperationEnum_t::AtomicAdd,
1,
true>(
out_grid_desc_m,
make_multi_index(blkgroup_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize),
PassThroughOp<AccDataType>{});
threadwise_dst_store.Run(
reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, out_global_buf);
}
};
};
} // namespace ck
#endif
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_TWO_CALL_HPP
#define CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_TWO_CALL_HPP
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_accumulate.hpp"
#include "reduction_functions_blockwise.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
namespace ck {
template <typename GridwiseReduction,
bool NeedIndices,
typename InDataType,
typename AccDataType,
typename IndexDataType,
typename InGridDesc_M_K,
typename WorkspaceDesc_M_K,
typename InElementwiseOperation,
typename AccElementwiseOperation>
__global__ void
kernel_partial_reduce_multiblock(const InGridDesc_M_K in_grid_desc_m_k,
const WorkspaceDesc_M_K workspace_desc_m_k,
const InElementwiseOperation in_elementwise_op,
const AccElementwiseOperation acc_elementwise_op,
index_t block_group_size,
index_t num_k_block_tile_iteration,
const InDataType* const __restrict__ p_src_global,
AccDataType* const __restrict__ p_ws_values_global,
IndexDataType* const __restrict__ p_ws_indices_global)
{
if constexpr(!NeedIndices)
{
GridwiseReduction::Run(in_grid_desc_m_k,
workspace_desc_m_k,
in_elementwise_op,
acc_elementwise_op,
block_group_size,
num_k_block_tile_iteration,
p_src_global,
p_ws_values_global,
p_ws_indices_global);
}
else
{
GridwiseReduction::RunWithIndex(in_grid_desc_m_k,
workspace_desc_m_k,
in_elementwise_op,
acc_elementwise_op,
block_group_size,
num_k_block_tile_iteration,
p_src_global,
p_ws_values_global,
p_ws_indices_global);
};
};
template <typename InDataType,
typename AccDataType,
typename IndexDataType,
typename InGridDesc_M_K,
typename WorkspaceDesc_M_K,
typename ReduceOperation,
typename InElementwiseOperation,
typename AccElementwiseOperation,
bool PropagateNan,
index_t BlockSize,
index_t MThreadClusterSize,
index_t KThreadClusterSize,
index_t MThreadSliceSize,
index_t KThreadSliceSize,
index_t InSrcVectorDim,
index_t InSrcVectorSize,
index_t OutDstVectorSize>
struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
{
static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0);
static constexpr auto buffer1dDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<BlockSize>{}));
template <typename T>
using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<T, T>;
static constexpr auto I0 = Number<0>{};
static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
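// Run() writes only partial reduction values to the workspace; RunWithIndex() also records the
// source index of each partial result, so a subsequent call can complete the reduction.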
__device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k,
const WorkspaceDesc_M_K& workspace_desc_m_k,
const InElementwiseOperation& in_elementwise_op,
const AccElementwiseOperation& acc_elementwise_op,
index_t block_group_size,
index_t num_k_block_tile_iteration,
const InDataType* const __restrict__ p_src_global,
AccDataType* const __restrict__ p_ws_values_global,
IndexDataType* const __restrict__ p_ws_indices_global)
{
using BlockwiseReduce = PartitionedBlockwiseReductionOn1dBuffer<decltype(buffer1dDesc),
AccDataType,
BlockSize,
MThreadClusterSize,
KThreadClusterSize,
reorder_thread_cluster,
ReduceOperation,
PropagateNan>;
using Accumulation =
detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;
(void)p_ws_indices_global;
(void)acc_elementwise_op;
const auto zeroVal = ReduceOperation::GetReductionZeroVal();
// LDS
__shared__ AccDataType p_block_reduce_buffer[BlockSize];
const auto in_global_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Global>(p_src_global,
in_grid_desc_m_k.GetElementSpaceSize(),
type_convert<InDataType>(zeroVal));
auto workspace_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_ws_values_global, workspace_desc_m_k.GetElementSpaceSize());
auto block_reduce_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_block_reduce_buffer, BlockSize);
StaticBuffer<AddressSpaceEnum_t::Vgpr,
AccDataType,
MThreadSliceSize * KThreadSliceSize,
true>
in_thread_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;
static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; });
const index_t thread_local_id = get_thread_local_1d_id();
const index_t block_global_id = get_block_1d_id();
const index_t blkgroup_id = block_global_id / block_group_size;
const index_t block_local_id = block_global_id % block_group_size;
const index_t thread_m_cluster_id =
reorder_thread_cluster ? thread_local_id % MThreadClusterSize
: ((thread_local_id / KThreadClusterSize) % MThreadClusterSize);
const index_t thread_k_cluster_id =
reorder_thread_cluster ? ((thread_local_id / MThreadClusterSize) % KThreadClusterSize)
: thread_local_id % KThreadClusterSize;
const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration;
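// Each block in a group reduces reduceSizePerBlock contiguous elements along K; block_local_id
// determines where this block's slice begins.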
using ThreadBufferLengths = Sequence<MThreadSliceSize, KThreadSliceSize>;
constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));
auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<
InDataType,
AccDataType,
InGridDesc_M_K,
decltype(thread_buffer_desc),
ThreadBufferLengths,
typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
InSrcVectorDim,
InSrcVectorSize,
1,
false>(
in_grid_desc_m_k,
make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize,
block_local_id * reduceSizePerBlock +
thread_k_cluster_id * KThreadSliceSize));
constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize);
index_t reducedTiles = 0;
do
{
threadwise_src_load.Run(in_grid_desc_m_k,
in_global_buf,
thread_buffer_desc,
make_tuple(I0, I0),
in_thread_buf);
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
// do element-wise pre-reduction operation
static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
in_elementwise_op(in_thread_buf(offset), in_thread_buf(offset));
});
// reduce on each thread-local slice
static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
Accumulation::Calculate(accu_value_buf(I), in_thread_buf[offset]);
});
});
threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);
reducedTiles++;
} while(reducedTiles < num_k_block_tile_iteration);
constexpr auto reduced_data_desc = make_naive_tensor_descriptor_packed(
make_tuple(Number<MThreadSliceSize>{}, Number<1>{}));
// Each block performs multiple parallel reductions in LDS; because vector loads are used,
// each block/thread covers multiple invariant dimensions.
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
if constexpr(reorder_thread_cluster)
{
block_reduce_buf(thread_k_cluster_id * MThreadClusterSize + thread_m_cluster_id) =
accu_value_buf[I];
}
else
block_reduce_buf(thread_m_cluster_id * KThreadClusterSize + thread_k_cluster_id) =
accu_value_buf[I];
accu_value_buf(I) = zeroVal;
__syncthreads();
BlockwiseReduce::Reduce(
block_reduce_buf, accu_value_buf(I), thread_m_cluster_id, thread_k_cluster_id);
});
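// Threads in the first K cluster column write this block's partial result into workspace column
// block_local_id; the per-row partials are expected to be combined by a follow-up reduction.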
if(thread_k_cluster_id == 0)
{
auto threadwise_workspace_store =
ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
AccDataType,
decltype(reduced_data_desc),
WorkspaceDesc_M_K,
PassThroughOp<AccDataType>,
Sequence<MThreadSliceSize, 1>,
Sequence<0, 1>,
1,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(
workspace_desc_m_k,
make_multi_index(blkgroup_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize,
block_local_id),
PassThroughOp<AccDataType>{});
threadwise_workspace_store.Run(reduced_data_desc,
make_tuple(I0, I0),
accu_value_buf,
workspace_desc_m_k,
workspace_global_buf);
}
};
__device__ static void RunWithIndex(const InGridDesc_M_K& in_grid_desc_m_k,
const WorkspaceDesc_M_K& workspace_desc_m_k,
const InElementwiseOperation& in_elementwise_op,
const AccElementwiseOperation& acc_elementwise_op,
index_t block_group_size,
index_t num_k_block_tile_iteration,
const InDataType* const __restrict__ p_src_global,
AccDataType* const __restrict__ p_ws_values_global,
IndexDataType* const __restrict__ p_ws_indices_global)
{
using BlockwiseReduceWithIndex =
PartitionedBlockwiseReductionWithIndexOn1dBuffer<decltype(buffer1dDesc),
AccDataType,
IndexDataType,
BlockSize,
MThreadClusterSize,
KThreadClusterSize,
reorder_thread_cluster,
ReduceOperation,
PropagateNan>;
using AccumulationWithIndex = detail::AccumulateWithIndexAndNanCheck<PropagateNan,
ReduceOperation,
AccDataType,
IndexDataType>;
(void)acc_elementwise_op;
const auto zeroVal = ReduceOperation::GetReductionZeroVal();
// LDS
__shared__ AccDataType p_block_reduce_val_buffer[BlockSize];
__shared__ index_t p_block_reduce_idx_buffer[BlockSize];
const auto in_global_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Global>(p_src_global,
in_grid_desc_m_k.GetElementSpaceSize(),
type_convert<InDataType>(zeroVal));
auto workspace_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_ws_values_global, workspace_desc_m_k.GetElementSpaceSize());
auto workspace_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_ws_indices_global, workspace_desc_m_k.GetElementSpaceSize());
auto block_reduce_val_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_block_reduce_val_buffer, BlockSize);
auto block_reduce_idx_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_block_reduce_idx_buffer, BlockSize);
StaticBuffer<AddressSpaceEnum_t::Vgpr,
AccDataType,
MThreadSliceSize * KThreadSliceSize,
true>
in_thread_val_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr,
IndexDataType,
MThreadSliceSize * KThreadSliceSize,
true>
in_thread_idx_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, IndexDataType, MThreadSliceSize, true>
accu_index_buf;
const index_t thread_local_id = get_thread_local_1d_id();
const index_t block_global_id = get_block_1d_id();
const index_t blkgroup_id = block_global_id / block_group_size;
const index_t block_local_id = block_global_id % block_group_size;
const index_t thread_m_cluster_id =
reorder_thread_cluster ? thread_local_id % MThreadClusterSize
: ((thread_local_id / KThreadClusterSize) % MThreadClusterSize);
const index_t thread_k_cluster_id =
reorder_thread_cluster ? ((thread_local_id / MThreadClusterSize) % KThreadClusterSize)
: thread_local_id % KThreadClusterSize;
const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration;
using ThreadBufferLengths = Sequence<MThreadSliceSize, KThreadSliceSize>;
constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));
auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<
InDataType,
AccDataType,
InGridDesc_M_K,
decltype(thread_buffer_desc),
ThreadBufferLengths,
typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
InSrcVectorDim,
InSrcVectorSize,
1,
false>(
in_grid_desc_m_k,
make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize,
block_local_id * reduceSizePerBlock +
thread_k_cluster_id * KThreadSliceSize));
constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize);
index_t indexOffset = block_local_id * reduceSizePerBlock;
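// indexOffset is the global K position of the current tile's first element for this block; it is
// advanced by K_BlockTileSize each iteration so recorded indices refer to the original K dimension.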
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
accu_value_buf(I) = zeroVal;
accu_index_buf(I) = 0;
});
index_t reducedTiles = 0;
do
{
// load the thread slice
threadwise_src_load.Run(in_grid_desc_m_k,
in_global_buf,
thread_buffer_desc,
make_tuple(I0, I0),
in_thread_val_buf);
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
// initialize the global indices of the values in this thread's slice
in_thread_idx_buf(offset) =
indexOffset + thread_k_cluster_id * KThreadSliceSize + J();
// do element-wise pre-reduction operation
in_elementwise_op(in_thread_val_buf(offset), in_thread_val_buf(offset));
});
AccDataType tmpValue = zeroVal;
IndexDataType tmpIndex = 0;
static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
// reduce on the dim1 thread slice
AccumulationWithIndex::Calculate(
tmpValue, in_thread_val_buf[offset], tmpIndex, in_thread_idx_buf[offset]);
});
// store thread local value to LDS for parallel reduction
if constexpr(reorder_thread_cluster)
{
block_reduce_val_buf(thread_k_cluster_id * MThreadClusterSize +
thread_m_cluster_id) = tmpValue;
block_reduce_idx_buf(thread_k_cluster_id * MThreadClusterSize +
thread_m_cluster_id) = tmpIndex;
}
else
{
block_reduce_val_buf(thread_m_cluster_id * KThreadClusterSize +
thread_k_cluster_id) = tmpValue;
block_reduce_idx_buf(thread_m_cluster_id * KThreadClusterSize +
thread_k_cluster_id) = tmpIndex;
}
__syncthreads();
BlockwiseReduceWithIndex::Reduce(block_reduce_val_buf,
block_reduce_idx_buf,
tmpValue,
tmpIndex,
thread_m_cluster_id,
thread_k_cluster_id);
AccumulationWithIndex::Calculate(
accu_value_buf(I), tmpValue, accu_index_buf(I), tmpIndex);
});
threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);
indexOffset += K_BlockTileSize;
reducedTiles++;
} while(reducedTiles < num_k_block_tile_iteration);
constexpr auto reduced_data_desc = make_naive_tensor_descriptor_packed(
make_tuple(Number<MThreadSliceSize>{}, Number<1>{}));
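// Store both the partial values and their winning indices into workspace column block_local_id.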
if(thread_k_cluster_id == 0)
{
auto threadwise_workspace_val_store =
ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
AccDataType,
decltype(reduced_data_desc),
WorkspaceDesc_M_K,
PassThroughOp<AccDataType>,
Sequence<MThreadSliceSize, 1>,
Sequence<0, 1>,
1,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(
workspace_desc_m_k,
make_multi_index(blkgroup_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize,
block_local_id),
PassThroughOp<AccDataType>{});
auto threadwise_workspace_idx_store =
ThreadwiseTensorSliceTransfer_v1r3<IndexDataType,
IndexDataType,
decltype(reduced_data_desc),
WorkspaceDesc_M_K,
PassThroughOp<IndexDataType>,
Sequence<MThreadSliceSize, 1>,
Sequence<0, 1>,
1,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(
workspace_desc_m_k,
make_multi_index(blkgroup_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize,
block_local_id),
PassThroughOp<IndexDataType>{});
threadwise_workspace_val_store.Run(reduced_data_desc,
make_tuple(I0, I0),
accu_value_buf,
workspace_desc_m_k,
workspace_global_val_buf);
threadwise_workspace_idx_store.Run(reduced_data_desc,
make_tuple(I0, I0),
accu_index_buf,
workspace_desc_m_k,
workspace_global_idx_buf);
}
};
};
} // namespace ck
#endif
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_2D_REDUCTION_THREADWISE_HPP
#define CK_GRIDWISE_2D_REDUCTION_THREADWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_accumulate.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
namespace ck {
template <typename GridwiseReduction,
bool NeedIndices,
typename InDataType,
typename OutDataType,
typename AccDataType,
typename IndexDataType,
typename InGridDesc_M_K,
typename OutGridDesc_M,
typename InElementwiseOperation,
typename AccElementwiseOperation>
__global__ void kernel_reduce_threadwise(const InGridDesc_M_K in_grid_desc_m_k,
const OutGridDesc_M out_grid_desc_m,
const InElementwiseOperation in_elementwise_op,
const AccElementwiseOperation acc_elementwise_op,
AccDataType alpha,
const InDataType* const __restrict__ p_in_global,
OutDataType beta,
OutDataType* const __restrict__ p_out_global,
IndexDataType* const __restrict__ p_indices_global)
{
if constexpr(!NeedIndices)
{
GridwiseReduction::Run(in_grid_desc_m_k,
out_grid_desc_m,
in_elementwise_op,
acc_elementwise_op,
alpha,
p_in_global,
beta,
p_out_global,
p_indices_global);
}
else
{
GridwiseReduction::RunWithIndices(in_grid_desc_m_k,
out_grid_desc_m,
in_elementwise_op,
acc_elementwise_op,
alpha,
p_in_global,
beta,
p_out_global,
p_indices_global);
};
};
template <typename InDataType,
typename OutDataType,
typename AccDataType,
typename IndexDataType,
typename InGridDesc_M_K,
typename OutGridDesc_M,
typename ReduceOperation,
typename InElementwiseOperation,
typename AccElementwiseOperation,
bool PropagateNan,
bool BetaIsZero,
index_t BlockSize,
index_t MThreadClusterSize,
index_t KThreadClusterSize,
index_t MThreadSliceSize,
index_t KThreadSliceSize,
index_t InSrcVectorDim,
index_t InSrcVectorSize,
index_t OutDstVectorSize>
struct GridwiseReduction_mk_to_m_threadwise
{
template <typename T>
using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<T, T>;
static constexpr auto I0 = Number<0>{};
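// In the threadwise variant each output element is reduced entirely by a single thread, so no
// LDS buffer or cross-thread synchronization is needed.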
__device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k,
const OutGridDesc_M& out_grid_desc_m,
const InElementwiseOperation& in_elementwise_op,
const AccElementwiseOperation& acc_elementwise_op,
AccDataType alpha,
const InDataType* const __restrict__ p_in_global,
OutDataType beta,
OutDataType* const __restrict__ p_out_global,
IndexDataType* const __restrict__ p_indices_global)
{
using Accumulation =
detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;
(void)p_indices_global;
const auto zeroVal = ReduceOperation::GetReductionZeroVal();
const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert<InDataType>(zeroVal));
auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_out_global, out_grid_desc_m.GetElementSpaceSize());
StaticBuffer<AddressSpaceEnum_t::Vgpr,
AccDataType,
MThreadSliceSize * KThreadSliceSize,
true>
in_thread_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;
static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; });
const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{});
using ThreadBufferLengths = Sequence<MThreadSliceSize, KThreadSliceSize>;
constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));
index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
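// thread_global_1d_id * MThreadSliceSize is the first output row owned by this thread; the thread
// then walks the whole K extent in KThreadSliceSize-wide chunks.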
auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<
InDataType,
AccDataType,
InGridDesc_M_K,
decltype(thread_buffer_desc),
ThreadBufferLengths,
typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
InSrcVectorDim,
InSrcVectorSize,
1,
false>(in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0));
constexpr auto in_thread_copy_step = make_multi_index(0, KThreadSliceSize);
index_t reducedLength = 0;
do
{
threadwise_src_load.Run(in_grid_desc_m_k,
in_global_buf,
thread_buffer_desc,
make_tuple(I0, I0),
in_thread_buf);
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
// do element-wise pre-reduction operation
static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
in_elementwise_op(in_thread_buf(offset), in_thread_buf(offset));
});
// reduce on each thread-local slice
static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
Accumulation::Calculate(accu_value_buf(I), in_thread_buf[offset]);
});
});
threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);
reducedLength += KThreadSliceSize;
} while(reducedLength < toReduceLength);
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
acc_elementwise_op(accu_value_buf(I), accu_value_buf(I));
accu_value_buf(I) *= alpha;
});
constexpr auto reduced_data_desc =
make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));
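// When BetaIsZero is false and beta is non-zero at run time, the prior output values are loaded
// and blended in as beta * prior before the final store.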
if constexpr(!BetaIsZero)
{
if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load =
ThreadwiseTensorSliceTransfer_v2<OutDataType,
OutDataType,
OutGridDesc_M,
decltype(reduced_data_desc),
Sequence<MThreadSliceSize>,
Sequence<0>,
0,
1,
1,
true>(
out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize));
StaticBuffer<AddressSpaceEnum_t::Vgpr, OutDataType, MThreadSliceSize, true>
priorDstValue_buf;
threadwise_dst_load.Run(out_grid_desc_m,
dst_global_buf,
reduced_data_desc,
make_tuple(I0),
priorDstValue_buf);
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
accu_value_buf(I) += type_convert<AccDataType>(priorDstValue_buf[I] * beta);
});
};
};
auto threadwise_dst_store =
ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
OutDataType,
decltype(reduced_data_desc),
OutGridDesc_M,
PassThroughOp<AccDataType>,
Sequence<MThreadSliceSize>,
Sequence<0>,
0,
OutDstVectorSize,
InMemoryDataOperationEnum_t::Set,
1,
false>(
out_grid_desc_m,
make_multi_index(thread_global_1d_id * MThreadSliceSize),
PassThroughOp<AccDataType>{});
threadwise_dst_store.Run(
reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, dst_global_buf);
};
__device__ static void RunWithIndices(const InGridDesc_M_K& in_grid_desc_m_k,
const OutGridDesc_M& out_grid_desc_m,
const InElementwiseOperation& in_elementwise_op,
const AccElementwiseOperation& acc_elementwise_op,
AccDataType alpha,
const InDataType* const __restrict__ p_in_global,
OutDataType beta,
OutDataType* const __restrict__ p_out_global,
IndexDataType* const __restrict__ p_indices_global)
{
using AccumulationWithIndex = detail::AccumulateWithIndexAndNanCheck<PropagateNan,
ReduceOperation,
AccDataType,
IndexDataType>;
(void)acc_elementwise_op;
const auto zeroVal = ReduceOperation::GetReductionZeroVal();
const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert<InDataType>(zeroVal));
auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_out_global, out_grid_desc_m.GetElementSpaceSize());
auto out_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_indices_global, out_grid_desc_m.GetElementSpaceSize());
StaticBuffer<AddressSpaceEnum_t::Vgpr,
AccDataType,
MThreadSliceSize * KThreadSliceSize,
true>
in_thread_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, IndexDataType, MThreadSliceSize, true>
accu_index_buf;
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
accu_value_buf(I) = zeroVal;
accu_index_buf(I) = 0;
});
const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{});
using ThreadBufferLengths = Sequence<MThreadSliceSize, KThreadSliceSize>;
constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));
index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<
InDataType,
AccDataType,
InGridDesc_M_K,
decltype(thread_buffer_desc),
ThreadBufferLengths,
typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
InSrcVectorDim,
InSrcVectorSize,
1,
false>(in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0));
constexpr auto in_thread_copy_step = make_multi_index(0, KThreadSliceSize);
index_t indexStart = 0;
index_t reducedLength = 0;
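// indexStart tracks the K offset of the current chunk so the recorded reduction index
// (indexStart + J) refers to a position in the original K dimension.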
do
{
threadwise_src_load.Run(in_grid_desc_m_k,
in_global_buf,
thread_buffer_desc,
make_tuple(I0, I0),
in_thread_buf);
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
// do element-wise pre-reduction operation
static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
in_elementwise_op(in_thread_buf(offset), in_thread_buf(offset));
});
// reduce on each thread-local slice
static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
AccumulationWithIndex::Calculate(accu_value_buf(I),
in_thread_buf[offset],
accu_index_buf(I),
indexStart + J);
});
});
threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);
indexStart += KThreadSliceSize;
reducedLength += KThreadSliceSize;
} while(reducedLength < toReduceLength);
// for indexed reduction operations, acc_elementwise_op is expected to do nothing
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
acc_elementwise_op(accu_value_buf(I), accu_value_buf(I));
accu_value_buf(I) *= alpha;
});
constexpr auto reduced_data_desc =
make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));
if constexpr(!BetaIsZero)
{
if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load =
ThreadwiseTensorSliceTransfer_v2<OutDataType,
OutDataType,
OutGridDesc_M,
decltype(reduced_data_desc),
Sequence<MThreadSliceSize>,
Sequence<0>,
0,
1,
1,
false>(
out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize));
StaticBuffer<AddressSpaceEnum_t::Vgpr, OutDataType, MThreadSliceSize, true>
priorDstValue_buf;
threadwise_dst_load.Run(out_grid_desc_m,
out_global_val_buf,
reduced_data_desc,
make_tuple(I0),
priorDstValue_buf);
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
accu_value_buf(I) += type_convert<AccDataType>(priorDstValue_buf[I] * beta);
});
};
};
auto threadwise_dst_val_store =
ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
OutDataType,
decltype(reduced_data_desc),
OutGridDesc_M,
PassThroughOp<AccDataType>,
Sequence<MThreadSliceSize>,
Sequence<0>,
0,
OutDstVectorSize,
InMemoryDataOperationEnum_t::Set,
1,
false>(
out_grid_desc_m,
make_multi_index(thread_global_1d_id * MThreadSliceSize),
PassThroughOp<AccDataType>{});
auto threadwise_dst_idx_store =
ThreadwiseTensorSliceTransfer_v1r3<IndexDataType,
IndexDataType,
decltype(reduced_data_desc),
OutGridDesc_M,
PassThroughOp<IndexDataType>,
Sequence<MThreadSliceSize>,
Sequence<0>,
0,
OutDstVectorSize,
InMemoryDataOperationEnum_t::Set,
1,
false>(
out_grid_desc_m,
make_multi_index(thread_global_1d_id * MThreadSliceSize),
PassThroughOp<IndexDataType>{});
threadwise_dst_val_store.Run(
reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, out_global_val_buf);
threadwise_dst_idx_store.Run(
reduced_data_desc, make_tuple(I0), accu_index_buf, out_grid_desc_m, out_global_idx_buf);
};
};
} // namespace ck
#endif