"torchvision/git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "b384c4e78b4549d99721b9eaf03ce16d8caba0d7"
Unverified commit 827301d9 authored by Qianfeng, committed by GitHub

Pr82 followup (#115)

* Use thread cluster descriptor and explicit M_K 2d descriptor to simplify blockwise reduction

* Replace ReduceDims with NumReduceDims as the Device Reduce interface template parameter

* Rename the folders for the pool2d and reduce examples

* Update the reduction test scripts

* Add Readme for pool2d_fwd and reduce_blockwise examples

* Tiny fix in reduce profiler and tiny update in reduce testing scripts

* Tiny fix in testing script profile_reduce_no_index.sh

* Tiny change in script/profile_reduce_with_index.sh

* Renaming and refining in Reduction profiler/device layer/examples

* Renaming and refining in Reduction profiler/device layer/examples

* Renaming all NumReduceDims to NumReduceDim
parent 5d37d7bf
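
The central interface change in this follow-up is that the reduce dimensions are no longer a compile-time `ck::Sequence` template argument; the device ops now take only the count `NumReduceDim` at compile time and receive the concrete dimension indices as a runtime vector. A condensed before/after sketch of the example's instantiation (template arguments abbreviated, taken from the diff below):

```cpp
// Before (compile-time reduce dimensions):
using ReduceDims_ = ck::Sequence<0, 1, 2>;
using DeviceReduceInstance =
    DeviceReduceBlockWise<kInDataType, kAccDataType, kOutDataType,
                          Rank, ReduceDims_, /* ...remaining parameters... */>;

// After (only the count is a template parameter; the dimensions are runtime data):
constexpr int NumReduceDim = 3;
using DeviceReduceInstance =
    DeviceReduceBlockWise<kInDataType, kAccDataType, kOutDataType,
                          Rank, NumReduceDim, /* ...remaining parameters... */>;
const std::vector<int> reduceDims{0, 1, 2}; // forwarded to the device op at argument creation
```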
# Instructions for ```reduce_blockwise``` Example
## Docker script
```bash
docker run \
-it \
--rm \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
/bin/bash
```
## Build ```reduce_blockwise```
```bash
mkdir build && cd build
```
```bash
# Need to specify target ID, example below is gfx908
cmake \
-D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \
..
```
```bash
make -j reduce_blockwise
```
## Run ```reduce_blockwise```
```bash
# -D <xxx>: input 4-d tensor lengths
# -v <x>: verification (0=no, 1=yes)
# arg1: initialization (0=no init, 1=integer value, 2=decimal value)
# arg2: number of times to run the kernel (>1)
./bin/reduce_blockwise -D 16,64,32,960 -v 1 1 10
```
Result
```
launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
Warm up
Start running 3 times...
Perf: 0.23536 ms, 267.32 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
error: 0
max_diff: 0, 529, 529
root@dc-smc-18:/data/composable_kernel/Build3# bin/reduce_blockwise -D 16,64,32,960 -v 1 1 10
launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
Warm up
Start running 10 times...
Perf: 0.23392 ms, 268.966 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
error: 0
max_diff: 0, 528, 528
```
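
For orientation, the example instantiates a NORM2 reduction of a rank-4 half-precision tensor over dimensions {0, 1, 2}, so the `-D 16,64,32,960` run above produces a length-960 output. A minimal host-side sketch of that computation in plain C++ (float accumulation, all-ones input; illustrative only and independent of the composable_kernel host utilities):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// NORM2 reduction over dims {0,1,2} of an N0 x N1 x N2 x N3 tensor:
// out[d3] = sqrt( sum_{d0,d1,d2} in[d0][d1][d2][d3]^2 )
std::vector<float> norm2_over_leading_dims(const std::vector<float>& in,
                                           int N0, int N1, int N2, int N3)
{
    std::vector<float> out(N3, 0.0f);
    for(int d0 = 0; d0 < N0; ++d0)
        for(int d1 = 0; d1 < N1; ++d1)
            for(int d2 = 0; d2 < N2; ++d2)
                for(int d3 = 0; d3 < N3; ++d3)
                {
                    float v = in[((d0 * N1 + d1) * N2 + d2) * N3 + d3];
                    out[d3] += v * v;
                }
    for(float& x : out)
        x = std::sqrt(x);
    return out;
}

int main()
{
    // shape matching the README example: 16 x 64 x 32 x 960, all-ones input
    std::vector<float> in(16 * 64 * 32 * 960, 1.0f);
    auto out = norm2_over_leading_dims(in, 16, 64, 32, 960);
    std::printf("out[0] = %f (expected sqrt(16*64*32) = %f)\n",
                out[0], std::sqrt(16.0f * 64 * 32));
    return 0;
}
```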
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "device_reduce_blockwise.hpp" #include "device_reduce_blockwise.hpp"
#include "host_reduce_util.hpp" #include "host_reduce_util.hpp"
#include "host_generic_reduction.hpp" #include "host_generic_reduction.hpp"
#include "reduction_enums.hpp" #include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp" #include "reduction_operator_mapping.hpp"
...@@ -28,8 +29,8 @@ using kInDataType = ck::half_t; ...@@ -28,8 +29,8 @@ using kInDataType = ck::half_t;
using kOutDataType = ck::half_t; using kOutDataType = ck::half_t;
using kAccDataType = float; using kAccDataType = float;
constexpr int Rank = 4; constexpr int Rank = 4;
using ReduceDims_ = ck::Sequence<0, 1, 2>; constexpr int NumReduceDim = 3;
constexpr ReduceTensorOp_t ReduceOpId = ReduceTensorOp_t::NORM2; constexpr ReduceTensorOp_t ReduceOpId = ReduceTensorOp_t::NORM2;
constexpr NanPropagation_t NanOpt = NanPropagation_t::PROPAGATE_NAN; constexpr NanPropagation_t NanOpt = NanPropagation_t::PROPAGATE_NAN;
...@@ -46,7 +47,7 @@ using DeviceReduceInstance = DeviceReduceBlockWise<kInDataType, ...@@ -46,7 +47,7 @@ using DeviceReduceInstance = DeviceReduceBlockWise<kInDataType,
kAccDataType, kAccDataType,
kOutDataType, kOutDataType,
Rank, Rank,
ReduceDims_, NumReduceDim,
ReduceOperation, ReduceOperation,
InElementwiseOperation, InElementwiseOperation,
AccElementwiseOperation, AccElementwiseOperation,
...@@ -192,39 +193,13 @@ class SimpleAppArgs ...@@ -192,39 +193,13 @@ class SimpleAppArgs
}; };
}; };
template <int Rank, typename ReduceDims>
static std::vector<int> get_reduce_dims()
{
std::vector<int> resDims;
static_for<0, ReduceDims::Size(), 1>{}([&](auto i) { resDims.push_back(ReduceDims::At(i)); });
return (resDims);
};
template <int Rank, typename ReduceDims>
static std::vector<int> get_invariant_dims()
{
std::vector<int> resDims;
unsigned int incFlag = 0;
static_for<0, ReduceDims::Size(), 1>{}(
[&](auto i) { incFlag = incFlag | (0x1 << ReduceDims::At(i)); });
for(int dim = 0; dim < Rank; dim++)
{
if(incFlag & (0x1 << dim))
continue;
resDims.push_back(dim);
};
return (resDims);
};
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
using namespace ck::host_reduce; using namespace ck::host_reduce;
const std::vector<int> reduceDims{0, 1, 2};
const std::vector<int> invariantDims{3};
SimpleAppArgs args; SimpleAppArgs args;
if(args.processArgs(argc, argv) < 0) if(args.processArgs(argc, argv) < 0)
...@@ -260,15 +235,12 @@ int main(int argc, char* argv[]) ...@@ -260,15 +235,12 @@ int main(int argc, char* argv[])
Tensor<InDataType> in(args.inLengths); Tensor<InDataType> in(args.inLengths);
const std::vector<int> InvariantDims = get_invariant_dims<Rank, ReduceDims_>();
const std::vector<int> ReduceDims = get_reduce_dims<Rank, ReduceDims_>();
std::vector<size_t> outLengths; std::vector<size_t> outLengths;
if(InvariantDims.empty()) if(invariantDims.empty())
outLengths.push_back(1); outLengths.push_back(1);
else else
for(auto dim : InvariantDims) for(auto dim : invariantDims)
outLengths.push_back(args.inLengths[dim]); outLengths.push_back(args.inLengths[dim]);
Tensor<OutDataType> out_ref(outLengths); Tensor<OutDataType> out_ref(outLengths);
...@@ -328,7 +300,7 @@ int main(int argc, char* argv[]) ...@@ -328,7 +300,7 @@ int main(int argc, char* argv[])
if(args.do_verification) if(args.do_verification)
{ {
ReductionHost<InDataType, AccDataType, OutDataType, ReduceOpId, PropagateNan, NeedIndices> ReductionHost<InDataType, AccDataType, OutDataType, ReduceOpId, PropagateNan, NeedIndices>
hostReduce(in.mDesc, out_ref.mDesc, InvariantDims, ReduceDims); hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
hostReduce.Run( hostReduce.Run(
alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data()); alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
...@@ -350,6 +322,7 @@ int main(int argc, char* argv[]) ...@@ -350,6 +322,7 @@ int main(int argc, char* argv[])
i_inStrides, i_inStrides,
i_outLengths, i_outLengths,
i_outStrides, i_outStrides,
reduceDims,
alpha, alpha,
beta, beta,
in_dev.GetDeviceBuffer(), in_dev.GetDeviceBuffer(),
......
# Instructions for ```pool2d_fwd``` Example
## Docker script
```bash
docker run \
-it \
--rm \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
/bin/bash
```
## Build ```pool2d_fwd```
```bash
mkdir build && cd build
```
```bash
# Need to specify target ID, example below is gfx908
cmake \
-D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \
..
```
```bash
make -j pool2d_fwd
```
## Run ```pool2d_fwd```
```bash
# arg1: verification (0=no, 1=yes)
# arg2: initialization (0=no init, 1=integer value, 2=decimal value)
# arg3: number of times to run the kernel (>1)
# arg4 to arg15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx
./example/pool2d_fwd 1 1 10
```
Result
```
in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192}
launch_and_time_kernel: grid_dim {124416, 1, 1}, block_dim {64, 1, 1}
Warm up
Start running 10 times...
Perf: 0.415453 ms, 1.37996 TFlops, 749.726 GB/s
error: 0
max_diff: 0, 1, 1
```
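
The `in_n_c_hi_wi`/`out_n_c_ho_wo` lengths printed above follow the usual forward-pooling output-size relation. A small standalone sketch for deriving Ho and Wo from the window, stride, and padding arguments (plain C++; the 3x3 window, stride 2, padding 1 values are only an assumption consistent with the printed 71 -> 36):

```cpp
#include <cstdio>

// Standard forward-pooling output length along one spatial dimension:
// out = floor((in + pad_left + pad_right - window) / stride) + 1
int pool_out_len(int in, int window, int stride, int pad_left, int pad_right)
{
    return (in + pad_left + pad_right - window) / stride + 1;
}

int main()
{
    // The printed 71 -> 36 above is consistent with, for example,
    // a 3x3 window, stride 2, and padding 1 on each side.
    int Ho = pool_out_len(71, 3, 2, 1, 1);
    int Wo = pool_out_len(71, 3, 2, 1, 1);
    std::printf("Ho = %d, Wo = %d\n", Ho, Wo); // Ho = 36, Wo = 36
    return 0;
}
```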
...@@ -32,57 +32,53 @@ ...@@ -32,57 +32,53 @@
#include "reduction_operator.hpp" #include "reduction_operator.hpp"
#include "reduction_functions_accumulate.hpp" #include "reduction_functions_accumulate.hpp"
#include "cluster_descriptor.hpp"
namespace ck { namespace ck {
template <typename Buffer1dDescType, template <typename AccDataType,
typename AccDataType,
index_t BlockSize, index_t BlockSize,
index_t MThreadClusterSize, typename ThreadClusterLengths_M_K,
index_t KThreadClusterSize, typename ThreadClusterArrangeOrder,
bool ReorderThreadClusters,
typename OpReduce, typename OpReduce,
bool PropagateNan> bool PropagateNan>
struct PartitionedBlockwiseReductionOn1dBuffer struct PartitionedBlockwiseReduction
{ {
static constexpr auto buffer_1d_desc = Buffer1dDescType{}; static_assert(BlockSize == ThreadClusterLengths_M_K::At(0) * ThreadClusterLengths_M_K::At(1),
static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
"The product of cluster lengths should be same as BlockSize!"); "The product of cluster lengths should be same as BlockSize!");
static_assert(KThreadClusterSize > 1, "Parallel reduction need work on at least two elements");
static_assert(buffer_1d_desc.GetElementSize() == BlockSize, static constexpr auto BufferLength_M = ThreadClusterLengths_M_K::At(0);
"The buffer size should be the same as BlockSize!"); static constexpr auto BufferLength_K = ThreadClusterLengths_M_K::At(1);
static_assert(BufferLength_K > 1, "Parallel reduction need work on at least two elements");
static constexpr auto block_buf_desc_m_k = make_naive_tensor_descriptor_packed(
make_tuple(Number<BufferLength_M>{}, Number<BufferLength_K>{}));
static constexpr auto thread_cluster_desc =
make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
using Accumulation = detail::AccumulateWithNanCheck<PropagateNan, OpReduce, AccDataType>; using Accumulation = detail::AccumulateWithNanCheck<PropagateNan, OpReduce, AccDataType>;
template <typename BufferType> template <typename BufferType>
__device__ static void Reduce(BufferType& block_buffer, __device__ static void Reduce(BufferType& block_buffer, AccDataType& accuData)
AccDataType& accuData,
index_t thread_m_cluster_id,
index_t thread_k_cluster_id)
{ {
constexpr auto cluster_len_shift = get_shift<KThreadClusterSize>(); constexpr auto cluster_len_shift = get_shift<BufferLength_K>();
const auto thread_cluster_idx =
thread_cluster_desc.CalculateBottomIndex(make_multi_index(get_thread_local_1d_id()));
const auto thread_m_cluster_id = thread_cluster_idx[Number<0>{}];
const auto thread_k_cluster_id = thread_cluster_idx[Number<1>{}];
static_for<0, cluster_len_shift, 1>{}([&](auto I) { static_for<0, cluster_len_shift, 1>{}([&](auto I) {
constexpr index_t indOffset = 1 << (cluster_len_shift - 1 - I()); constexpr index_t indOffset = 1 << (cluster_len_shift - 1 - I());
if(thread_k_cluster_id < indOffset) if(thread_k_cluster_id < indOffset)
{ {
// consider the thread clusters order, ensure the contiguous locations are accessed index_t offset1 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx);
// by contiguous Thread-ID index_t offset2 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx +
index_t offset1 = make_tuple(0, indOffset));
ReorderThreadClusters
? buffer_1d_desc.CalculateOffset(make_tuple(
thread_k_cluster_id * MThreadClusterSize + thread_m_cluster_id))
: buffer_1d_desc.CalculateOffset(make_tuple(
thread_m_cluster_id * KThreadClusterSize + thread_k_cluster_id));
index_t offset2 = ReorderThreadClusters
? buffer_1d_desc.CalculateOffset(make_tuple(
(thread_k_cluster_id + indOffset) * MThreadClusterSize +
thread_m_cluster_id))
: buffer_1d_desc.CalculateOffset(
make_tuple(thread_m_cluster_id * KThreadClusterSize +
(thread_k_cluster_id + indOffset)));
AccDataType opData1 = type_convert<AccDataType>(block_buffer[offset1]); AccDataType opData1 = type_convert<AccDataType>(block_buffer[offset1]);
AccDataType opData2 = type_convert<AccDataType>(block_buffer[offset2]); AccDataType opData2 = type_convert<AccDataType>(block_buffer[offset2]);
...@@ -93,34 +89,34 @@ struct PartitionedBlockwiseReductionOn1dBuffer ...@@ -93,34 +89,34 @@ struct PartitionedBlockwiseReductionOn1dBuffer
__syncthreads(); __syncthreads();
}); });
index_t offset = ReorderThreadClusters index_t offset = block_buf_desc_m_k.CalculateOffset(make_tuple(thread_m_cluster_id, 0));
? buffer_1d_desc.CalculateOffset(make_tuple(thread_m_cluster_id))
: buffer_1d_desc.CalculateOffset(
make_tuple(thread_m_cluster_id * KThreadClusterSize));
accuData = type_convert<AccDataType>(block_buffer[offset]); accuData = type_convert<AccDataType>(block_buffer[offset]);
}; };
}; };
template <typename Buffer1dDescType, template <typename AccDataType,
typename AccDataType,
typename IndexDataType, typename IndexDataType,
index_t BlockSize, index_t BlockSize,
index_t MThreadClusterSize, typename ThreadClusterLengths_M_K,
index_t KThreadClusterSize, typename ThreadClusterArrangeOrder,
bool ReorderThreadClusters,
typename OpReduce, typename OpReduce,
bool PropagateNan> bool PropagateNan>
struct PartitionedBlockwiseReductionWithIndexOn1dBuffer struct PartitionedBlockwiseReductionWithIndex
{ {
static constexpr auto buffer_1d_desc = Buffer1dDescType{}; static_assert(BlockSize == ThreadClusterLengths_M_K::At(0) * ThreadClusterLengths_M_K::At(1),
static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
"The product of cluster lengths should be same as BlockSize!"); "The product of cluster lengths should be same as BlockSize!");
static_assert(KThreadClusterSize > 1, "Parallel reduction need work on at least two elements");
static_assert(buffer_1d_desc.GetElementSize() == BlockSize, static constexpr auto BufferLength_M = ThreadClusterLengths_M_K::At(0);
"The buffer size should be the same as BlockSize!"); static constexpr auto BufferLength_K = ThreadClusterLengths_M_K::At(1);
static_assert(BufferLength_K > 1, "Parallel reduction need work on at least two elements");
static constexpr auto block_buf_desc_m_k = make_naive_tensor_descriptor_packed(
make_tuple(Number<BufferLength_M>{}, Number<BufferLength_K>{}));
static constexpr auto thread_cluster_desc =
make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
using Accumulation = using Accumulation =
detail::AccumulateWithIndexAndNanCheck<PropagateNan, OpReduce, AccDataType, IndexDataType>; detail::AccumulateWithIndexAndNanCheck<PropagateNan, OpReduce, AccDataType, IndexDataType>;
...@@ -130,32 +126,24 @@ struct PartitionedBlockwiseReductionWithIndexOn1dBuffer ...@@ -130,32 +126,24 @@ struct PartitionedBlockwiseReductionWithIndexOn1dBuffer
__device__ static void Reduce(BufferType& block_val_buffer, __device__ static void Reduce(BufferType& block_val_buffer,
IdxBufferType& block_idx_buffer, IdxBufferType& block_idx_buffer,
AccDataType& accuData, AccDataType& accuData,
IndexDataType& accuIndex, IndexDataType& accuIndex)
index_t thread_m_cluster_id,
index_t thread_k_cluster_id)
{ {
constexpr auto cluster_len_shift = get_shift<KThreadClusterSize>(); constexpr auto cluster_len_shift = get_shift<BufferLength_K>();
const auto thread_cluster_idx =
thread_cluster_desc.CalculateBottomIndex(make_multi_index(get_thread_local_1d_id()));
const auto thread_m_cluster_id = thread_cluster_idx[Number<0>{}];
const auto thread_k_cluster_id = thread_cluster_idx[Number<1>{}];
static_for<0, cluster_len_shift, 1>{}([&](auto I) { static_for<0, cluster_len_shift, 1>{}([&](auto I) {
constexpr index_t indOffset = 1 << I(); constexpr index_t indOffset = 1 << I();
if(thread_k_cluster_id % (indOffset * 2) == 0) if(thread_k_cluster_id % (indOffset * 2) == 0)
{ {
// consider the thread clusters order, ensure the contiguous locations are accessed index_t offset1 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx);
// by contiguous Thread-ID index_t offset2 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx +
index_t offset1 = make_tuple(0, indOffset));
ReorderThreadClusters
? buffer_1d_desc.CalculateOffset(make_tuple(
thread_k_cluster_id * MThreadClusterSize + thread_m_cluster_id))
: buffer_1d_desc.CalculateOffset(make_tuple(
thread_m_cluster_id * KThreadClusterSize + thread_k_cluster_id));
index_t offset2 = ReorderThreadClusters
? buffer_1d_desc.CalculateOffset(make_tuple(
(thread_k_cluster_id + indOffset) * MThreadClusterSize +
thread_m_cluster_id))
: buffer_1d_desc.CalculateOffset(
make_tuple(thread_m_cluster_id * KThreadClusterSize +
(thread_k_cluster_id + indOffset)));
AccDataType opData1 = type_convert<AccDataType>(block_val_buffer[offset1]); AccDataType opData1 = type_convert<AccDataType>(block_val_buffer[offset1]);
AccDataType opData2 = type_convert<AccDataType>(block_val_buffer[offset2]); AccDataType opData2 = type_convert<AccDataType>(block_val_buffer[offset2]);
...@@ -170,10 +158,7 @@ struct PartitionedBlockwiseReductionWithIndexOn1dBuffer ...@@ -170,10 +158,7 @@ struct PartitionedBlockwiseReductionWithIndexOn1dBuffer
__syncthreads(); __syncthreads();
}); });
index_t offset = ReorderThreadClusters index_t offset = block_buf_desc_m_k.CalculateOffset(make_tuple(thread_m_cluster_id, 0));
? buffer_1d_desc.CalculateOffset(make_tuple(thread_m_cluster_id))
: buffer_1d_desc.CalculateOffset(
make_tuple(thread_m_cluster_id * KThreadClusterSize));
accuData = type_convert<AccDataType>(block_val_buffer[offset]); accuData = type_convert<AccDataType>(block_val_buffer[offset]);
accuIndex = block_idx_buffer[offset]; accuIndex = block_idx_buffer[offset];
......
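
The refactored `PartitionedBlockwiseReduction` performs the same log2(K)-step tree reduction along the K dimension of an M x K LDS buffer as before; the change is that each thread's (m, k) position now comes from the thread cluster descriptor and the explicit M_K buffer descriptor rather than the old `ReorderThreadClusters` offset arithmetic. A host-side simulation of that reduction pattern (plain C++ with add as the reduce op; on the GPU each step runs per-thread and the steps are separated by `__syncthreads()`):

```cpp
#include <cstdio>
#include <vector>

// Simulate the partitioned blockwise tree reduction over an M x K buffer:
// after log2(K) halving steps, row m holds its reduced value at column 0.
void partitioned_blockwise_reduce(std::vector<float>& buf, int M, int K)
{
    for(int offset = K / 2; offset > 0; offset /= 2)         // one step per "shift"
    {
        for(int m = 0; m < M; ++m)                           // every thread row
            for(int k = 0; k < offset; ++k)                  // threads with k < offset
                buf[m * K + k] += buf[m * K + (k + offset)]; // OpReduce = add here
        // (on the GPU a __syncthreads() separates the steps)
    }
}

int main()
{
    constexpr int M = 4, K = 64;                             // e.g. 4 x 64 = 256 = BlockSize
    std::vector<float> buf(M * K, 1.0f);
    partitioned_blockwise_reduce(buf, M, K);
    std::printf("row 0 result = %f (expected %d)\n", buf[0 * K + 0], K);
    return 0;
}
```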
...@@ -36,14 +36,15 @@ struct DeviceReduce : public BaseOperator ...@@ -36,14 +36,15 @@ struct DeviceReduce : public BaseOperator
const std::vector<int>& inStrides, const std::vector<int>& inStrides,
const std::vector<int>& outLengths, const std::vector<int>& outLengths,
const std::vector<int>& outStrides, const std::vector<int>& outStrides,
const std::vector<int>& reduceDims,
float alpha, float alpha,
float beta, float beta,
const void* in_dev, const void* in_dev,
void* out_dev, void* out_dev,
void* out_indices_dev, void* out_indices_dev,
void* workspace_dev, void* workspace_dev,
const InElementwiseOperation& inElementwiseOp, const InElementwiseOperation& in_elementwise_op,
const AccElementwiseOperation& accElementwiseOp) = 0; const AccElementwiseOperation& acc_elementwise_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0; virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
}; };
......
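
From the caller's perspective the new `reduceDims` argument sits between the output strides and `alpha`. A schematic call site (not compilable on its own; the variable names are illustrative and the factory is assumed to be the `MakeArgumentPointer` counterpart of the `MakeInvokerPointer` shown above):

```cpp
// Schematic use of the extended DeviceReduce interface (illustrative only).
std::vector<int> inLengths{16, 64, 32, 960}, inStrides{/* ... */};
std::vector<int> outLengths{960}, outStrides{/* ... */};
std::vector<int> reduceDims{0, 1, 2}; // new runtime parameter

auto argument_ptr = reduce_op.MakeArgumentPointer(inLengths,
                                                  inStrides,
                                                  outLengths,
                                                  outStrides,
                                                  reduceDims, // inserted by this PR
                                                  alpha,
                                                  beta,
                                                  in_dev,
                                                  out_dev,
                                                  out_indices_dev,
                                                  workspace_dev,
                                                  in_elementwise_op,
                                                  acc_elementwise_op);
auto invoker_ptr = reduce_op.MakeInvokerPointer();
```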
...@@ -15,8 +15,8 @@ namespace device { ...@@ -15,8 +15,8 @@ namespace device {
template <typename InDataType, template <typename InDataType,
typename AccDataType, typename AccDataType,
typename OutDataType, typename OutDataType,
int Rank, index_t Rank,
typename ReduceDims, index_t NumReduceDim,
typename ReduceOperation, typename ReduceOperation,
typename InElementwiseOperation, typename InElementwiseOperation,
typename AccElementwiseOperation, typename AccElementwiseOperation,
...@@ -40,7 +40,12 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl ...@@ -40,7 +40,12 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
static constexpr bool BetaIsZero = NeedIndices; static constexpr bool BetaIsZero = NeedIndices;
using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>()); static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
using InvariantDims =
typename conditional<NumInvariantDim == 0,
Sequence<>,
typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
static constexpr index_t srcDims = Rank; static constexpr index_t srcDims = Rank;
static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size(); static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
...@@ -74,7 +79,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl ...@@ -74,7 +79,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
} }
else else
{ {
const auto toReduceDimLengths = const auto reduceDimLengths =
make_tuple_from_array_and_index_seq(inLengths, ReduceDims{}); make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
const auto invariantDimLengths = const auto invariantDimLengths =
make_tuple_from_array_and_index_seq(inLengths, InvariantDims{}); make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});
...@@ -82,7 +87,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl ...@@ -82,7 +87,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
return transform_tensor_descriptor( return transform_tensor_descriptor(
inDesc, inDesc,
make_tuple(make_merge_transform(invariantDimLengths), make_tuple(make_merge_transform(invariantDimLengths),
make_merge_transform(toReduceDimLengths)), make_merge_transform(reduceDimLengths)),
make_tuple(InvariantDims{}, ReduceDims{}), make_tuple(InvariantDims{}, ReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{})); make_tuple(Sequence<0>{}, Sequence<1>{}));
} }
...@@ -136,6 +141,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl ...@@ -136,6 +141,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
const std::vector<int>& inStrides, const std::vector<int>& inStrides,
const std::vector<int>& outLengths, const std::vector<int>& outLengths,
const std::vector<int>& outStrides, const std::vector<int>& outStrides,
const std::vector<int>& reduceDims,
float alpha, float alpha,
float beta, float beta,
const InDataType* in_dev, const InDataType* in_dev,
...@@ -144,30 +150,31 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl ...@@ -144,30 +150,31 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
AccDataType* workspace_dev, AccDataType* workspace_dev,
const InElementwiseOperation& in_elementwise_op, const InElementwiseOperation& in_elementwise_op,
const AccElementwiseOperation& acc_elementwise_op) const AccElementwiseOperation& acc_elementwise_op)
: in_dev_{in_dev}, out_dev_{out_dev}, out_indices_dev_{out_indices_dev} : outLengths_{outLengths},
outStrides_{outStrides},
in_dev_{in_dev},
out_dev_{out_dev},
out_indices_dev_{out_indices_dev},
in_elementwise_op_{in_elementwise_op},
acc_elementwise_op_{acc_elementwise_op}
{ {
(void)workspace_dev; (void)workspace_dev;
inLengths_ = inLengths; std::tie(inLengths_, inStrides_) =
inStrides_ = inStrides; shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, inStrides, reduceDims);
outLengths_ = outLengths;
outStrides_ = outStrides;
in_elementwise_op_ = in_elementwise_op;
acc_elementwise_op_ = acc_elementwise_op;
alpha_ = static_cast<AccDataType>(alpha); alpha_ = static_cast<AccDataType>(alpha);
beta_ = static_cast<OutDataType>(beta); beta_ = static_cast<OutDataType>(beta);
std::tie(invariant_total_length, reduce_total_length) = std::tie(invariant_total_length, reduce_total_length) =
get_2d_lengths<Rank, ReduceDims>(inLengths); get_2d_lengths<Rank, ReduceDims>(inLengths_);
if constexpr(InvariantDims::Size() == 0) if constexpr(InvariantDims::Size() == 0)
invariant_lowest_length = 1; invariant_lowest_length = 1;
else else
invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)]; invariant_lowest_length = inLengths_[InvariantDims::At(InvariantDims::Size() - 1)];
reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)]; reduce_lowest_length = inLengths_[ReduceDims::At(ReduceDims::Size() - 1)];
gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
M_BlockTileSize; M_BlockTileSize;
...@@ -305,6 +312,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl ...@@ -305,6 +312,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
const std::vector<int>& inStrides, const std::vector<int>& inStrides,
const std::vector<int>& outLengths, const std::vector<int>& outLengths,
const std::vector<int>& outStrides, const std::vector<int>& outStrides,
const std::vector<int>& reduceDims,
float alpha, float alpha,
float beta, float beta,
const void* in_dev, const void* in_dev,
...@@ -318,6 +326,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl ...@@ -318,6 +326,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
inStrides, inStrides,
outLengths, outLengths,
outStrides, outStrides,
reduceDims,
alpha, alpha,
beta, beta,
static_cast<const InDataType*>(in_dev), static_cast<const InDataType*>(in_dev),
......
...@@ -15,8 +15,8 @@ namespace device { ...@@ -15,8 +15,8 @@ namespace device {
template <typename InDataType, template <typename InDataType,
typename AccDataType, typename AccDataType,
typename OutDataType, typename OutDataType,
int Rank, index_t Rank,
typename ReduceDims, index_t NumReduceDim,
typename ReduceOperation, typename ReduceOperation,
typename InElementwiseOperation, typename InElementwiseOperation,
typename AccElementwiseOperation, typename AccElementwiseOperation,
...@@ -45,7 +45,11 @@ struct DeviceReduceBlockWiseSecondCall ...@@ -45,7 +45,11 @@ struct DeviceReduceBlockWiseSecondCall
std::is_same<InDataType, AccDataType>::value, std::is_same<InDataType, AccDataType>::value,
"InDataType and AccDataType should be the same to use DEviceReduceBlockWiseSecondCall!"); "InDataType and AccDataType should be the same to use DEviceReduceBlockWiseSecondCall!");
using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>()); static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
using InvariantDims =
typename conditional<NumInvariantDim == 0,
Sequence<>,
typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size(); static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
...@@ -117,16 +121,16 @@ struct DeviceReduceBlockWiseSecondCall ...@@ -117,16 +121,16 @@ struct DeviceReduceBlockWiseSecondCall
AccDataType* workspace_dev, AccDataType* workspace_dev,
const InElementwiseOperation& in_elementwise_op, const InElementwiseOperation& in_elementwise_op,
const AccElementwiseOperation& acc_elementwise_op) const AccElementwiseOperation& acc_elementwise_op)
: in_dev_{in_dev}, out_dev_{out_dev}, out_indices_dev_{out_indices_dev} : inLengths_(inLengths),
inStrides_(inStrides),
outLengths_(outLengths),
outStrides_(outStrides),
in_dev_{in_dev},
out_dev_{out_dev},
out_indices_dev_{out_indices_dev},
in_elementwise_op_(in_elementwise_op),
acc_elementwise_op_(acc_elementwise_op)
{ {
inLengths_ = inLengths;
inStrides_ = inStrides;
outLengths_ = outLengths;
outStrides_ = outStrides;
in_elementwise_op_ = in_elementwise_op;
acc_elementwise_op_ = acc_elementwise_op;
alpha_ = static_cast<AccDataType>(alpha); alpha_ = static_cast<AccDataType>(alpha);
beta_ = static_cast<OutDataType>(beta); beta_ = static_cast<OutDataType>(beta);
...@@ -268,6 +272,7 @@ struct DeviceReduceBlockWiseSecondCall ...@@ -268,6 +272,7 @@ struct DeviceReduceBlockWiseSecondCall
const std::vector<int>& inStrides, const std::vector<int>& inStrides,
const std::vector<int>& outLengths, const std::vector<int>& outLengths,
const std::vector<int>& outStrides, const std::vector<int>& outStrides,
const std::vector<int>& reduceDims,
float alpha, float alpha,
float beta, float beta,
const void* in_dev, const void* in_dev,
...@@ -277,6 +282,8 @@ struct DeviceReduceBlockWiseSecondCall ...@@ -277,6 +282,8 @@ struct DeviceReduceBlockWiseSecondCall
const InElementwiseOperation& in_elementwise_op, const InElementwiseOperation& in_elementwise_op,
const AccElementwiseOperation& acc_elementwise_op) override const AccElementwiseOperation& acc_elementwise_op) override
{ {
(void)reduceDims;
return std::make_unique<Argument>(inLengths, return std::make_unique<Argument>(inLengths,
inStrides, inStrides,
outLengths, outLengths,
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
#define DEVICE_REDUCE_COMMON_HPP #define DEVICE_REDUCE_COMMON_HPP
#include <vector> #include <vector>
#include <cassert>
#include "common_header.hpp" #include "common_header.hpp"
#include "reduction_enums.hpp" #include "reduction_enums.hpp"
...@@ -40,23 +41,6 @@ constexpr bool belong() ...@@ -40,23 +41,6 @@ constexpr bool belong()
return (inside); return (inside);
}; };
template <int Rank, typename ReduceDims, int start = 0>
constexpr auto get_invariant_dims()
{
static_assert(Rank <= 6, "bigger Rank size not supported!");
if constexpr(start >= Rank)
return Sequence<>{};
else
{
if constexpr(!belong<start, ReduceDims>())
return merge_sequences(Sequence<start>{},
get_invariant_dims<Rank, ReduceDims, start + 1>());
else
return get_invariant_dims<Rank, ReduceDims, start + 1>();
};
};
// helper functions using variadic template arguments // helper functions using variadic template arguments
template <index_t... Ns> template <index_t... Ns>
static auto make_tuple_from_array_and_index_seq(const std::vector<int>& lengths, Sequence<Ns...>) static auto make_tuple_from_array_and_index_seq(const std::vector<int>& lengths, Sequence<Ns...>)
...@@ -74,6 +58,45 @@ static auto make_tuple_from_array(const std::vector<int>& lengths, Number<arrayS ...@@ -74,6 +58,45 @@ static auto make_tuple_from_array(const std::vector<int>& lengths, Number<arrayS
return make_tuple_from_array_and_index_seq(lengths, index_seq); return make_tuple_from_array_and_index_seq(lengths, index_seq);
}; };
template <index_t Rank, index_t NumReduceDim>
static inline std::pair<std::vector<int>, std::vector<int>>
shuffle_tensor_dimensions(const std::vector<int>& dimLengths,
const std::vector<int>& dimStrides,
const std::vector<int>& reduceDims)
{
std::vector<int> newDimLengths;
std::vector<int> newDimStrides;
assert(Rank == dimLengths.size() && Rank == dimStrides.size() &&
NumReduceDim == reduceDims.size());
int reduceFlag = 0;
// flag the bits for the reduceDims
for(int i = 0; i < NumReduceDim; i++)
{
reduceFlag |= 1 << reduceDims[i];
};
// collect invariant dimensions
for(int i = 0; i < Rank; i++)
if((reduceFlag & (1 << i)) == 0)
{
newDimLengths.push_back(dimLengths[i]);
newDimStrides.push_back(dimStrides[i]);
};
// collect reduce dimensions
for(int i = 0; i < Rank; i++)
if((reduceFlag & (1 << i)) > 0)
{
newDimLengths.push_back(dimLengths[i]);
newDimStrides.push_back(dimStrides[i]);
};
return std::make_pair(newDimLengths, newDimStrides);
};
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
......
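
This `shuffle_tensor_dimensions` helper is what makes the runtime `reduceDims` workable: it reorders lengths and strides so that invariant dimensions come first and reduce dimensions come last, after which the device ops can treat the leading `Rank - NumReduceDim` positions as invariant at compile time. A standalone re-implementation with the example's shapes (plain C++, illustrative; the real helper is templated on `Rank` and `NumReduceDim` and asserts the vector sizes):

```cpp
#include <cstdio>
#include <utility>
#include <vector>

// Same reordering as shuffle_tensor_dimensions<Rank, NumReduceDim>():
// invariant dimensions first, reduce dimensions last.
std::pair<std::vector<int>, std::vector<int>>
shuffle_tensor_dimensions(const std::vector<int>& dimLengths,
                          const std::vector<int>& dimStrides,
                          const std::vector<int>& reduceDims)
{
    const int rank = static_cast<int>(dimLengths.size());
    std::vector<int> newLengths, newStrides;

    int reduceFlag = 0;
    for(int d : reduceDims)
        reduceFlag |= 1 << d; // flag the reduce dimensions

    for(int i = 0; i < rank; i++) // collect invariant dimensions first
        if((reduceFlag & (1 << i)) == 0)
        {
            newLengths.push_back(dimLengths[i]);
            newStrides.push_back(dimStrides[i]);
        }
    for(int i = 0; i < rank; i++) // then the reduce dimensions
        if((reduceFlag & (1 << i)) != 0)
        {
            newLengths.push_back(dimLengths[i]);
            newStrides.push_back(dimStrides[i]);
        }
    return {newLengths, newStrides};
}

int main()
{
    // Packed 16 x 64 x 32 x 960 tensor reduced over dims {0, 1, 2}
    std::vector<int> lengths{16, 64, 32, 960};
    std::vector<int> strides{64 * 32 * 960, 32 * 960, 960, 1};
    auto [newLengths, newStrides] = shuffle_tensor_dimensions(lengths, strides, {0, 1, 2});

    for(size_t i = 0; i < newLengths.size(); i++)
        std::printf("dim %zu: length %d, stride %d\n", i, newLengths[i], newStrides[i]);
    // -> dim 0: length 960, stride 1 (the single invariant dimension)
    //    dims 1..3: the reduced dimensions 16, 64, 32
    return 0;
}
```

Note that only the length/stride vectors are permuted; no tensor data is moved.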
...@@ -17,8 +17,8 @@ namespace device { ...@@ -17,8 +17,8 @@ namespace device {
template <typename InDataType, template <typename InDataType,
typename AccDataType, typename AccDataType,
typename OutDataType, typename OutDataType,
int Rank, index_t Rank,
typename ReduceDims, index_t NumReduceDim,
typename ReduceOperation, typename ReduceOperation,
typename InElementwiseOperation, typename InElementwiseOperation,
typename AccElementwiseOperation, typename AccElementwiseOperation,
...@@ -41,7 +41,12 @@ struct DeviceReduceMultiBlockAtomicAdd ...@@ -41,7 +41,12 @@ struct DeviceReduceMultiBlockAtomicAdd
using IndexDataType = int32_t; using IndexDataType = int32_t;
using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>()); static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
using InvariantDims =
typename conditional<NumInvariantDim == 0,
Sequence<>,
typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
static constexpr index_t srcDims = Rank; static constexpr index_t srcDims = Rank;
static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size(); static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
...@@ -84,7 +89,7 @@ struct DeviceReduceMultiBlockAtomicAdd ...@@ -84,7 +89,7 @@ struct DeviceReduceMultiBlockAtomicAdd
} }
else else
{ {
const auto toReduceDimLengths = const auto reduceDimLengths =
make_tuple_from_array_and_index_seq(inLengths, ReduceDims{}); make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
const auto invariantDimLengths = const auto invariantDimLengths =
make_tuple_from_array_and_index_seq(inLengths, InvariantDims{}); make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});
...@@ -92,7 +97,7 @@ struct DeviceReduceMultiBlockAtomicAdd ...@@ -92,7 +97,7 @@ struct DeviceReduceMultiBlockAtomicAdd
return transform_tensor_descriptor( return transform_tensor_descriptor(
inDesc, inDesc,
make_tuple(make_merge_transform(invariantDimLengths), make_tuple(make_merge_transform(invariantDimLengths),
make_merge_transform(toReduceDimLengths)), make_merge_transform(reduceDimLengths)),
make_tuple(InvariantDims{}, ReduceDims{}), make_tuple(InvariantDims{}, ReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{})); make_tuple(Sequence<0>{}, Sequence<1>{}));
} }
...@@ -147,6 +152,7 @@ struct DeviceReduceMultiBlockAtomicAdd ...@@ -147,6 +152,7 @@ struct DeviceReduceMultiBlockAtomicAdd
const std::vector<int>& inStrides, const std::vector<int>& inStrides,
const std::vector<int>& outLengths, const std::vector<int>& outLengths,
const std::vector<int>& outStrides, const std::vector<int>& outStrides,
const std::vector<int>& reduceDims,
float alpha, float alpha,
float beta, float beta,
const InDataType* in_dev, const InDataType* in_dev,
...@@ -155,31 +161,31 @@ struct DeviceReduceMultiBlockAtomicAdd ...@@ -155,31 +161,31 @@ struct DeviceReduceMultiBlockAtomicAdd
AccDataType* workspace_dev, AccDataType* workspace_dev,
const InElementwiseOperation& in_elementwise_op, const InElementwiseOperation& in_elementwise_op,
const AccElementwiseOperation& acc_elementwise_op) const AccElementwiseOperation& acc_elementwise_op)
: in_dev_{in_dev}, out_dev_{out_dev} : outLengths_{outLengths},
outStrides_{outStrides},
in_dev_{in_dev},
out_dev_{out_dev},
in_elementwise_op_{in_elementwise_op},
acc_elementwise_op_{acc_elementwise_op}
{ {
(void)out_indices_dev; (void)out_indices_dev;
(void)workspace_dev; (void)workspace_dev;
inLengths_ = inLengths; std::tie(inLengths_, inStrides_) =
inStrides_ = inStrides; shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, inStrides, reduceDims);
outLengths_ = outLengths;
outStrides_ = outStrides;
in_elementwise_op_ = in_elementwise_op;
acc_elementwise_op_ = acc_elementwise_op;
alpha_ = static_cast<AccDataType>(alpha); alpha_ = static_cast<AccDataType>(alpha);
beta_ = static_cast<OutDataType>(beta); beta_ = static_cast<OutDataType>(beta);
std::tie(invariant_total_length, reduce_total_length) = std::tie(invariant_total_length, reduce_total_length) =
get_2d_lengths<Rank, ReduceDims>(inLengths); get_2d_lengths<Rank, ReduceDims>(inLengths_);
if constexpr(InvariantDims::Size() == 0) if constexpr(InvariantDims::Size() == 0)
invariant_lowest_length = 1; invariant_lowest_length = 1;
else else
invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)]; invariant_lowest_length = inLengths_[InvariantDims::At(InvariantDims::Size() - 1)];
reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)]; reduce_lowest_length = inLengths_[ReduceDims::At(ReduceDims::Size() - 1)];
int iterations = 1; int iterations = 1;
while(true) while(true)
...@@ -369,6 +375,7 @@ struct DeviceReduceMultiBlockAtomicAdd ...@@ -369,6 +375,7 @@ struct DeviceReduceMultiBlockAtomicAdd
const std::vector<int>& inStrides, const std::vector<int>& inStrides,
const std::vector<int>& outLengths, const std::vector<int>& outLengths,
const std::vector<int>& outStrides, const std::vector<int>& outStrides,
const std::vector<int>& reduceDims,
float alpha, float alpha,
float beta, float beta,
const void* in_dev, const void* in_dev,
...@@ -382,6 +389,7 @@ struct DeviceReduceMultiBlockAtomicAdd ...@@ -382,6 +389,7 @@ struct DeviceReduceMultiBlockAtomicAdd
inStrides, inStrides,
outLengths, outLengths,
outStrides, outStrides,
reduceDims,
alpha, alpha,
beta, beta,
static_cast<const InDataType*>(in_dev), static_cast<const InDataType*>(in_dev),
......
...@@ -15,8 +15,8 @@ namespace device { ...@@ -15,8 +15,8 @@ namespace device {
template <typename InDataType, template <typename InDataType,
typename AccDataType, typename AccDataType,
typename OutDataType, typename OutDataType,
int Rank, index_t Rank,
typename ReduceDims, index_t NumReduceDim,
typename ReduceOperation, typename ReduceOperation,
typename InElementwiseOperation, typename InElementwiseOperation,
typename AccElementwiseOperation, typename AccElementwiseOperation,
...@@ -41,7 +41,12 @@ struct DeviceReduceMultiBlockPartialReduce ...@@ -41,7 +41,12 @@ struct DeviceReduceMultiBlockPartialReduce
using IndexDataType = int32_t; using IndexDataType = int32_t;
using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>()); static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
using InvariantDims =
typename conditional<NumInvariantDim == 0,
Sequence<>,
typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
static constexpr index_t srcDims = Rank; static constexpr index_t srcDims = Rank;
static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size(); static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
...@@ -112,7 +117,7 @@ struct DeviceReduceMultiBlockPartialReduce ...@@ -112,7 +117,7 @@ struct DeviceReduceMultiBlockPartialReduce
} }
else else
{ {
const auto toReduceDimLengths = const auto reduceDimLengths =
make_tuple_from_array_and_index_seq(inLengths, ReduceDims{}); make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
const auto invariantDimLengths = const auto invariantDimLengths =
make_tuple_from_array_and_index_seq(inLengths, InvariantDims{}); make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});
...@@ -120,7 +125,7 @@ struct DeviceReduceMultiBlockPartialReduce ...@@ -120,7 +125,7 @@ struct DeviceReduceMultiBlockPartialReduce
return transform_tensor_descriptor( return transform_tensor_descriptor(
inDesc, inDesc,
make_tuple(make_merge_transform(invariantDimLengths), make_tuple(make_merge_transform(invariantDimLengths),
make_merge_transform(toReduceDimLengths)), make_merge_transform(reduceDimLengths)),
make_tuple(InvariantDims{}, ReduceDims{}), make_tuple(InvariantDims{}, ReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{})); make_tuple(Sequence<0>{}, Sequence<1>{}));
} }
...@@ -161,10 +166,11 @@ struct DeviceReduceMultiBlockPartialReduce ...@@ -161,10 +166,11 @@ struct DeviceReduceMultiBlockPartialReduce
struct Argument : public BaseArgument struct Argument : public BaseArgument
{ {
Argument(const std::vector<index_t>& inLengths, Argument(const std::vector<int>& inLengths,
const std::vector<index_t>& inStrides, const std::vector<int>& inStrides,
const std::vector<index_t>& outLengths, const std::vector<int>& outLengths,
const std::vector<index_t>& outStrides, const std::vector<int>& outStrides,
const std::vector<int>& reduceDims,
float alpha, float alpha,
float beta, float beta,
const InDataType* in_dev, const InDataType* in_dev,
...@@ -173,31 +179,30 @@ struct DeviceReduceMultiBlockPartialReduce ...@@ -173,31 +179,30 @@ struct DeviceReduceMultiBlockPartialReduce
AccDataType* workspace_dev, AccDataType* workspace_dev,
const InElementwiseOperation& in_elementwise_op, const InElementwiseOperation& in_elementwise_op,
const AccElementwiseOperation& acc_elementwise_op) const AccElementwiseOperation& acc_elementwise_op)
: in_dev_{in_dev}, : outLengths_{outLengths},
outStrides_{outStrides},
in_dev_{in_dev},
out_dev_{out_dev}, out_dev_{out_dev},
out_indices_dev_{out_indices_dev}, out_indices_dev_{out_indices_dev},
workspace_dev_{workspace_dev} workspace_dev_{workspace_dev},
in_elementwise_op_{in_elementwise_op},
acc_elementwise_op_{acc_elementwise_op}
{ {
inLengths_ = inLengths; std::tie(inLengths_, inStrides_) =
inStrides_ = inStrides; shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, inStrides, reduceDims);
outLengths_ = outLengths;
outStrides_ = outStrides;
in_elementwise_op_ = in_elementwise_op;
acc_elementwise_op_ = acc_elementwise_op;
alpha_ = static_cast<AccDataType>(alpha); alpha_ = static_cast<AccDataType>(alpha);
beta_ = static_cast<OutDataType>(beta); beta_ = static_cast<OutDataType>(beta);
std::tie(invariant_total_length, reduce_total_length) = std::tie(invariant_total_length, reduce_total_length) =
get_2d_lengths<Rank, ReduceDims>(inLengths); get_2d_lengths<Rank, ReduceDims>(inLengths_);
if constexpr(InvariantDims::Size() == 0) if constexpr(InvariantDims::Size() == 0)
invariant_lowest_length = 1; invariant_lowest_length = 1;
else else
invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)]; invariant_lowest_length = inLengths_[InvariantDims::At(InvariantDims::Size() - 1)];
reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)]; reduce_lowest_length = inLengths_[ReduceDims::At(ReduceDims::Size() - 1)];
int iterations = 1; int iterations = 1;
while(true) while(true)
...@@ -370,6 +375,7 @@ struct DeviceReduceMultiBlockPartialReduce ...@@ -370,6 +375,7 @@ struct DeviceReduceMultiBlockPartialReduce
const std::vector<int>& inStrides, const std::vector<int>& inStrides,
const std::vector<int>& outLengths, const std::vector<int>& outLengths,
const std::vector<int>& outStrides, const std::vector<int>& outStrides,
const std::vector<int>& reduceDims,
float alpha, float alpha,
float beta, float beta,
const void* in_dev, const void* in_dev,
...@@ -383,6 +389,7 @@ struct DeviceReduceMultiBlockPartialReduce ...@@ -383,6 +389,7 @@ struct DeviceReduceMultiBlockPartialReduce
inStrides, inStrides,
outLengths, outLengths,
outStrides, outStrides,
reduceDims,
alpha, alpha,
beta, beta,
static_cast<const InDataType*>(in_dev), static_cast<const InDataType*>(in_dev),
......
...@@ -16,7 +16,7 @@ template <typename InDataType, ...@@ -16,7 +16,7 @@ template <typename InDataType,
typename AccDataType, typename AccDataType,
typename OutDataType, typename OutDataType,
index_t Rank, index_t Rank,
typename ReduceDims, index_t NumReduceDim,
typename ReduceOperation, typename ReduceOperation,
typename InElementwiseOperation, typename InElementwiseOperation,
typename OutElementwiseOperation, typename OutElementwiseOperation,
...@@ -40,7 +40,12 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE ...@@ -40,7 +40,12 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
static constexpr bool BetaIsZero = NeedIndices; static constexpr bool BetaIsZero = NeedIndices;
using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>()); static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
using InvariantDims =
typename conditional<NumInvariantDim == 0,
Sequence<>,
typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
static constexpr index_t srcDims = Rank; static constexpr index_t srcDims = Rank;
static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size(); static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
...@@ -74,7 +79,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE ...@@ -74,7 +79,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
} }
else else
{ {
const auto toReduceDimLengths = const auto reduceDimLengths =
make_tuple_from_array_and_index_seq(inLengths, ReduceDims{}); make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
const auto invariantDimLengths = const auto invariantDimLengths =
make_tuple_from_array_and_index_seq(inLengths, InvariantDims{}); make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});
...@@ -82,7 +87,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE ...@@ -82,7 +87,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
return transform_tensor_descriptor( return transform_tensor_descriptor(
inDesc, inDesc,
make_tuple(make_merge_transform(invariantDimLengths), make_tuple(make_merge_transform(invariantDimLengths),
make_merge_transform(toReduceDimLengths)), make_merge_transform(reduceDimLengths)),
make_tuple(InvariantDims{}, ReduceDims{}), make_tuple(InvariantDims{}, ReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{})); make_tuple(Sequence<0>{}, Sequence<1>{}));
} }
...@@ -136,6 +141,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE ...@@ -136,6 +141,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
const std::vector<int>& inStrides, const std::vector<int>& inStrides,
const std::vector<int>& outLengths, const std::vector<int>& outLengths,
const std::vector<int>& outStrides, const std::vector<int>& outStrides,
const std::vector<int>& reduceDims,
float alpha, float alpha,
float beta, float beta,
const InDataType* in_dev, const InDataType* in_dev,
...@@ -144,30 +150,32 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE ...@@ -144,30 +150,32 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
AccDataType* workspace_dev, AccDataType* workspace_dev,
const InElementwiseOperation& in_elementwise_op, const InElementwiseOperation& in_elementwise_op,
const OutElementwiseOperation& acc_elementwise_op) const OutElementwiseOperation& acc_elementwise_op)
: in_dev_{in_dev}, out_dev_{out_dev}, out_indices_dev_{out_indices_dev} : outLengths_{outLengths},
outStrides_{outStrides},
in_dev_{in_dev},
out_dev_{out_dev},
out_indices_dev_{out_indices_dev},
in_elementwise_op_{in_elementwise_op},
acc_elementwise_op_{acc_elementwise_op}
{ {
(void)workspace_dev; (void)workspace_dev;
inLengths_ = inLengths; std::tie(inLengths_, inStrides_) =
inStrides_ = inStrides; shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, inStrides, reduceDims);
outLengths_ = outLengths;
outStrides_ = outStrides;
in_elementwise_op_ = in_elementwise_op;
acc_elementwise_op_ = acc_elementwise_op;
alpha_ = static_cast<AccDataType>(alpha); alpha_ = static_cast<AccDataType>(alpha);
beta_ = static_cast<OutDataType>(beta); beta_ = static_cast<OutDataType>(beta);
std::tie(invariant_total_length, reduce_total_length) = std::tie(invariant_total_length, reduce_total_length) =
get_2d_lengths<Rank, ReduceDims>(inLengths); get_2d_lengths<Rank, ReduceDims>(inLengths_);
if constexpr(InvariantDims::Size() == 0) if constexpr(InvariantDims::Size() == 0)
invariant_lowest_length = 1; invariant_lowest_length = 1;
else else
invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)]; invariant_lowest_length = inLengths_[InvariantDims::At(InvariantDims::Size() - 1)];
reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)]; reduce_lowest_length = inLengths_[ReduceDims::At(ReduceDims::Size() - 1)];
gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
M_BlockTileSize; M_BlockTileSize;
...@@ -306,6 +314,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE ...@@ -306,6 +314,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
const std::vector<int>& inStrides, const std::vector<int>& inStrides,
const std::vector<int>& outLengths, const std::vector<int>& outLengths,
const std::vector<int>& outStrides, const std::vector<int>& outStrides,
const std::vector<int>& reduceDims,
float alpha, float alpha,
float beta, float beta,
const void* in_dev, const void* in_dev,
...@@ -319,6 +328,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE ...@@ -319,6 +328,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
inStrides, inStrides,
outLengths, outLengths,
outStrides, outStrides,
reduceDims,
alpha, alpha,
beta, beta,
static_cast<const InDataType*>(in_dev), static_cast<const InDataType*>(in_dev),
......
...@@ -86,22 +86,34 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add ...@@ -86,22 +86,34 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add
{ {
static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0); static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0);
static constexpr auto buffer_1d_desc = using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;
make_naive_tensor_descriptor_packed(make_tuple(Number<BlockSize>{}));
using ThreadBufferDimAccessOrder =
using blockwise_reduce = PartitionedBlockwiseReductionOn1dBuffer<decltype(buffer_1d_desc), typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
AccDataType,
BlockSize, using ThreadClusterArrangeOrder =
MThreadClusterSize, typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
KThreadClusterSize,
reorder_thread_cluster, static constexpr auto thread_cluster_desc =
ReduceOperation, make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
PropagateNan>;
// For laying out the threads to do reducing on LDS buffer, for LDS buffer, we always use the
// Dim_K as the fastest one
static constexpr auto block_buf_desc_m_k = make_naive_tensor_descriptor_packed(
make_tuple(Number<MThreadClusterSize>{}, Number<KThreadClusterSize>{}));
using BlockwiseReduce = PartitionedBlockwiseReduction<AccDataType,
BlockSize,
ThreadClusterLengths_M_K,
ThreadClusterArrangeOrder,
ReduceOperation,
PropagateNan>;
template <typename T> template <typename T>
using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<T, T>; using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<T, T>;
static constexpr auto I0 = Number<0>{}; static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
...@@ -145,12 +157,12 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add ...@@ -145,12 +157,12 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add
const index_t block_global_id = get_block_1d_id(); const index_t block_global_id = get_block_1d_id();
const index_t blkgroup_id = block_global_id / block_group_size; const index_t blkgroup_id = block_global_id / block_group_size;
const index_t block_local_id = block_global_id % block_group_size; const index_t block_local_id = block_global_id % block_group_size;
const index_t thread_m_cluster_id =
reorder_thread_cluster ? thread_local_id % MThreadClusterSize const auto thread_cluster_idx =
: ((thread_local_id / KThreadClusterSize) % MThreadClusterSize); thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));
const index_t thread_k_cluster_id =
reorder_thread_cluster ? ((thread_local_id / MThreadClusterSize) % KThreadClusterSize) const auto thread_m_cluster_id = thread_cluster_idx[I0];
: thread_local_id % KThreadClusterSize; const auto thread_k_cluster_id = thread_cluster_idx[I1];
const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration; const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration;
...@@ -158,17 +170,16 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add ...@@ -158,17 +170,16 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add
constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{})); make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));
auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2< auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<InDataType,
InDataType, AccDataType,
AccDataType, InGridDesc_M_K,
InGridDesc_M_K, decltype(thread_buffer_desc),
decltype(thread_buffer_desc), ThreadBufferLengths,
ThreadBufferLengths, ThreadBufferDimAccessOrder,
typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type, InSrcVectorDim,
InSrcVectorDim, InSrcVectorSize,
InSrcVectorSize, 1,
1, false>(
false>(
in_grid_desc_m_k, in_grid_desc_m_k,
make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize,
block_local_id * reduceSizePerBlock + block_local_id * reduceSizePerBlock +
...@@ -212,21 +223,14 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add ...@@ -212,21 +223,14 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add
// consistent reduced result for that invariant dimension. due to the using of vector_load, // consistent reduced result for that invariant dimension. due to the using of vector_load,
// each block/thread is involved into multiple invarirant dimensions. // each block/thread is involved into multiple invarirant dimensions.
static_for<0, MThreadSliceSize, 1>{}([&](auto I) { static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
if constexpr(reorder_thread_cluster) block_reduce_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) =
{ accu_value_buf[I];
block_reduce_buf(thread_k_cluster_id * MThreadClusterSize + thread_m_cluster_id) =
accu_value_buf[I];
}
else
block_reduce_buf(thread_m_cluster_id * KThreadClusterSize + thread_k_cluster_id) =
accu_value_buf[I];
accu_value_buf(I) = zeroVal; accu_value_buf(I) = zeroVal;
__syncthreads(); __syncthreads();
blockwise_reduce::Reduce( BlockwiseReduce::Reduce(block_reduce_buf, accu_value_buf(I));
block_reduce_buf, accu_value_buf(I), thread_m_cluster_id, thread_k_cluster_id);
}); });
static_for<0, MThreadSliceSize, 1>{}([&](auto I) { static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
......
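
The gridwise kernel now derives each thread's (m, k) cluster position from `thread_cluster_desc.CalculateBottomIndex` instead of the removed hand-written div/mod, and writes into the LDS buffer through `block_buf_desc_m_k`. A small host-side sketch of the index mapping the two arrange orders are expected to reproduce (plain C++, illustrative only):

```cpp
#include <cstdio>

int main()
{
    constexpr int M = 4, K = 64; // MThreadClusterSize x KThreadClusterSize = 256 threads

    for(int tid = 0; tid < 6; ++tid)
    {
        // ArrangeOrder = Sequence<0, 1>: K varies fastest with the thread id
        int m_01 = (tid / K) % M, k_01 = tid % K;
        // ArrangeOrder = Sequence<1, 0> (reorder_thread_cluster): M varies fastest
        int m_10 = tid % M, k_10 = (tid / M) % K;

        // These are the pairs the removed div/mod branches computed;
        // CalculateBottomIndex(tid) is expected to yield the same (m, k)
        // for the corresponding arrange order.
        std::printf("tid %d -> order<0,1>: (m=%d, k=%d)   order<1,0>: (m=%d, k=%d)\n",
                    tid, m_01, k_01, m_10, k_10);
    }
    return 0;
}
```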
...@@ -30,8 +30,8 @@ ...@@ -30,8 +30,8 @@
#include "reduction_operator.hpp" #include "reduction_operator.hpp"
#include "reduction_functions_accumulate.hpp" #include "reduction_functions_accumulate.hpp"
#include "reduction_functions_blockwise.hpp" #include "reduction_functions_blockwise.hpp"
#include "threadwise_tensor_slice_transfer.hpp" #include "threadwise_tensor_slice_transfer.hpp"
#include "cluster_descriptor.hpp"
namespace ck { namespace ck {
...@@ -103,13 +103,27 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce ...@@ -103,13 +103,27 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
{ {
static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0); static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0);
static constexpr auto buffer1dDesc = using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;
make_naive_tensor_descriptor_packed(make_tuple(Number<BlockSize>{}));
using ThreadBufferDimAccessOrder =
typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
using ThreadClusterArrangeOrder =
typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
static constexpr auto thread_cluster_desc =
make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
// For laying out the threads to do reducing on LDS buffer, for LDS buffer, we always use the
// Dim_K as the fastest one
static constexpr auto block_buf_desc_m_k = make_naive_tensor_descriptor_packed(
make_tuple(Number<MThreadClusterSize>{}, Number<KThreadClusterSize>{}));
template <typename T> template <typename T>
using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<T, T>; using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<T, T>;
static constexpr auto I0 = Number<0>{}; static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
...
@@ -124,14 +138,12 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
                                AccDataType* const __restrict__ p_ws_values_global,
                                IndexDataType* const __restrict__ p_ws_indices_global)
     {
-        using BlockwiseReduce = PartitionedBlockwiseReductionOn1dBuffer<decltype(buffer1dDesc),
-                                                                        AccDataType,
-                                                                        BlockSize,
-                                                                        MThreadClusterSize,
-                                                                        KThreadClusterSize,
-                                                                        reorder_thread_cluster,
-                                                                        ReduceOperation,
-                                                                        PropagateNan>;
+        using BlockwiseReduce = PartitionedBlockwiseReduction<AccDataType,
+                                                              BlockSize,
+                                                              ThreadClusterLengths_M_K,
+                                                              ThreadClusterArrangeOrder,
+                                                              ReduceOperation,
+                                                              PropagateNan>;

         using Accumulation =
             detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;
...
@@ -168,12 +180,12 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
         const index_t block_global_id = get_block_1d_id();
         const index_t blkgroup_id     = block_global_id / block_group_size;
         const index_t block_local_id  = block_global_id % block_group_size;
-        const index_t thread_m_cluster_id =
-            reorder_thread_cluster ? thread_local_id % MThreadClusterSize
-                                   : ((thread_local_id / KThreadClusterSize) % MThreadClusterSize);
-        const index_t thread_k_cluster_id =
-            reorder_thread_cluster ? ((thread_local_id / MThreadClusterSize) % KThreadClusterSize)
-                                   : thread_local_id % KThreadClusterSize;
+
+        const auto thread_cluster_idx =
+            thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));
+
+        const auto thread_m_cluster_id = thread_cluster_idx[I0];
+        const auto thread_k_cluster_id = thread_cluster_idx[I1];

         const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration;
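
The hunk above is the heart of the cleanup: the thread cluster descriptor now derives `(thread_m_cluster_id, thread_k_cluster_id)` from the linear thread id, replacing the removed hand-written modulo/divide arithmetic. As a rough standalone sketch (hypothetical helper names, not CK code), the two arrangements decompose a thread id like this:

```cpp
#include <cstdio>

// Illustrative decomposition of a linear thread id into (m, k) cluster
// coordinates; the two branches mirror the removed manual arithmetic for
// ThreadClusterArrangeOrder Sequence<0, 1> and Sequence<1, 0>, which
// make_cluster_descriptor + CalculateBottomIndex now handle generically.
struct ClusterIdx
{
    int m;
    int k;
};

ClusterIdx decompose(int tid, int MCluster, int KCluster, bool reorder_thread_cluster)
{
    if(reorder_thread_cluster) // arrange order Sequence<1, 0>: M varies fastest with tid
        return {tid % MCluster, (tid / MCluster) % KCluster};
    else // arrange order Sequence<0, 1>: K varies fastest with tid
        return {(tid / KCluster) % MCluster, tid % KCluster};
}

int main()
{
    const int MCluster = 4, KCluster = 64; // example cluster shape, 256 threads per block
    for(int tid : {0, 1, 63, 64, 255})
    {
        const auto a = decompose(tid, MCluster, KCluster, false);
        const auto b = decompose(tid, MCluster, KCluster, true);
        std::printf("tid %3d -> (m=%d, k=%2d) | reordered -> (m=%d, k=%2d)\n",
                    tid, a.m, a.k, b.m, b.k);
    }
    return 0;
}
```

Both arrangements cover the same 4 x 64 cluster; only which dimension varies fastest with the thread id changes, which is exactly what `ThreadClusterArrangeOrder` encodes.
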
...
@@ -181,17 +193,16 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
         constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
             make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));

-        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<
-            InDataType,
-            AccDataType,
-            InGridDesc_M_K,
-            decltype(thread_buffer_desc),
-            ThreadBufferLengths,
-            typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
-            InSrcVectorDim,
-            InSrcVectorSize,
-            1,
-            false>(
+        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<InDataType,
+                                                                    AccDataType,
+                                                                    InGridDesc_M_K,
+                                                                    decltype(thread_buffer_desc),
+                                                                    ThreadBufferLengths,
+                                                                    ThreadBufferDimAccessOrder,
+                                                                    InSrcVectorDim,
+                                                                    InSrcVectorSize,
+                                                                    1,
+                                                                    false>(
             in_grid_desc_m_k,
             make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize,
                              block_local_id * reduceSizePerBlock +
...
@@ -233,21 +244,14 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
         // Each block executes multiple parallel reductions on the LDS, and because of the use of
         // vector_load, each block/thread covers multiple invariant dimensions.
         static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
-            if constexpr(reorder_thread_cluster)
-            {
-                block_reduce_buf(thread_k_cluster_id * MThreadClusterSize + thread_m_cluster_id) =
-                    accu_value_buf[I];
-            }
-            else
-                block_reduce_buf(thread_m_cluster_id * KThreadClusterSize + thread_k_cluster_id) =
-                    accu_value_buf[I];
+            block_reduce_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) =
+                accu_value_buf[I];

             accu_value_buf(I) = zeroVal;

             __syncthreads();

-            BlockwiseReduce::Reduce(
-                block_reduce_buf, accu_value_buf(I), thread_m_cluster_id, thread_k_cluster_id);
+            BlockwiseReduce::Reduce(block_reduce_buf, accu_value_buf(I));
         });

         if(thread_k_cluster_id == 0)
...
@@ -290,15 +294,13 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
                                IndexDataType* const __restrict__ p_ws_indices_global)
     {
         using BlockwiseReduceWithIndex =
-            PartitionedBlockwiseReductionWithIndexOn1dBuffer<decltype(buffer1dDesc),
-                                                             AccDataType,
-                                                             IndexDataType,
-                                                             BlockSize,
-                                                             MThreadClusterSize,
-                                                             KThreadClusterSize,
-                                                             reorder_thread_cluster,
-                                                             ReduceOperation,
-                                                             PropagateNan>;
+            PartitionedBlockwiseReductionWithIndex<AccDataType,
+                                                   IndexDataType,
+                                                   BlockSize,
+                                                   ThreadClusterLengths_M_K,
+                                                   ThreadClusterArrangeOrder,
+                                                   ReduceOperation,
+                                                   PropagateNan>;

         using AccumulationWithIndex = detail::AccumulateWithIndexAndNanCheck<PropagateNan,
                                                                              ReduceOperation,
...
@@ -346,12 +348,12 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
         const index_t block_global_id = get_block_1d_id();
         const index_t blkgroup_id     = block_global_id / block_group_size;
         const index_t block_local_id  = block_global_id % block_group_size;
-        const index_t thread_m_cluster_id =
-            reorder_thread_cluster ? thread_local_id % MThreadClusterSize
-                                   : ((thread_local_id / KThreadClusterSize) % MThreadClusterSize);
-        const index_t thread_k_cluster_id =
-            reorder_thread_cluster ? ((thread_local_id / MThreadClusterSize) % KThreadClusterSize)
-                                   : thread_local_id % KThreadClusterSize;
+
+        const auto thread_cluster_idx =
+            thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));
+
+        const auto thread_m_cluster_id = thread_cluster_idx[I0];
+        const auto thread_k_cluster_id = thread_cluster_idx[I1];

         const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration;
...
@@ -359,17 +361,16 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
         constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
             make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));

-        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<
-            InDataType,
-            AccDataType,
-            InGridDesc_M_K,
-            decltype(thread_buffer_desc),
-            ThreadBufferLengths,
-            typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
-            InSrcVectorDim,
-            InSrcVectorSize,
-            1,
-            false>(
+        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<InDataType,
+                                                                    AccDataType,
+                                                                    InGridDesc_M_K,
+                                                                    decltype(thread_buffer_desc),
+                                                                    ThreadBufferLengths,
+                                                                    ThreadBufferDimAccessOrder,
+                                                                    InSrcVectorDim,
+                                                                    InSrcVectorSize,
+                                                                    1,
+                                                                    false>(
             in_grid_desc_m_k,
             make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize,
                              block_local_id * reduceSizePerBlock +
...
@@ -418,29 +419,15 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
                 });

                 // store thread local value to LDS for parallel reduction
-                if constexpr(reorder_thread_cluster)
-                {
-                    block_reduce_val_buf(thread_k_cluster_id * MThreadClusterSize +
-                                         thread_m_cluster_id) = tmpValue;
-                    block_reduce_idx_buf(thread_k_cluster_id * MThreadClusterSize +
-                                         thread_m_cluster_id) = tmpIndex;
-                }
-                else
-                {
-                    block_reduce_val_buf(thread_m_cluster_id * KThreadClusterSize +
-                                         thread_k_cluster_id) = tmpValue;
-                    block_reduce_idx_buf(thread_m_cluster_id * KThreadClusterSize +
-                                         thread_k_cluster_id) = tmpIndex;
-                }
+                block_reduce_val_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) =
+                    tmpValue;
+                block_reduce_idx_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) =
+                    tmpIndex;

                 __syncthreads();

-                BlockwiseReduceWithIndex::Reduce(block_reduce_val_buf,
-                                                 block_reduce_idx_buf,
-                                                 tmpValue,
-                                                 tmpIndex,
-                                                 thread_m_cluster_id,
-                                                 thread_k_cluster_id);
+                BlockwiseReduceWithIndex::Reduce(
+                    block_reduce_val_buf, block_reduce_idx_buf, tmpValue, tmpIndex);

                 AccumulationWithIndex::Calculate(
                     accu_value_buf(I), tmpValue, accu_index_buf(I), tmpIndex);
...
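
For orientation, the pattern the hunks above rely on is: each thread stores its partial value (and, in the indexed variant, its index) into an M x K LDS tile at the offset computed by `block_buf_desc_m_k`, and the partitioned blockwise reduction then collapses each of the M rows along K, leaving the result where the `thread_k_cluster_id == 0` threads read it back. A simplified host-side model of that reduction step (illustrative only, not CK's `PartitionedBlockwiseReduction`):

```cpp
#include <cstdio>
#include <vector>

// Host-side model of a partitioned blockwise reduction over an M x K tile
// stored with K as the fastest-varying dimension (the block_buf_desc_m_k
// layout). Each pass halves the active K range; on the GPU a
// __syncthreads() would separate the passes. Sum is used as the reduce
// operation purely for illustration.
void partitioned_reduce_m_k(std::vector<float>& buf, int M, int K)
{
    for(int stride = K / 2; stride > 0; stride /= 2)
    {
        for(int m = 0; m < M; ++m)
            for(int k = 0; k < stride; ++k)
                buf[m * K + k] += buf[m * K + k + stride];
    }
    // buf[m * K + 0] now holds the reduced value of row m
}

int main()
{
    const int M = 4, K = 8;
    std::vector<float> lds(M * K, 1.0f); // every "thread" contributed 1.0
    partitioned_reduce_m_k(lds, M, K);
    for(int m = 0; m < M; ++m)
        std::printf("row %d -> %g\n", m, lds[m * K]); // expect 8 per row
    return 0;
}
```
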
...
@@ -101,6 +101,9 @@ template <typename InDataType,
           index_t OutDstVectorSize>
 struct GridwiseReduction_mk_to_m_threadwise
 {
+    using ThreadBufferDimAccessOrder =
+        typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type;
+
     template <typename T>
     using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<T, T>;
...
@@ -147,17 +150,17 @@ struct GridwiseReduction_mk_to_m_threadwise
         index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();

-        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<
-            InDataType,
-            AccDataType,
-            InGridDesc_M_K,
-            decltype(thread_buffer_desc),
-            ThreadBufferLengths,
-            typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
-            InSrcVectorDim,
-            InSrcVectorSize,
-            1,
-            false>(in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0));
+        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<InDataType,
+                                                                    AccDataType,
+                                                                    InGridDesc_M_K,
+                                                                    decltype(thread_buffer_desc),
+                                                                    ThreadBufferLengths,
+                                                                    ThreadBufferDimAccessOrder,
+                                                                    InSrcVectorDim,
+                                                                    InSrcVectorSize,
+                                                                    1,
+                                                                    false>(
+            in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0));

         constexpr auto in_thread_copy_step = make_multi_index(0, KThreadSliceSize);
...
@@ -299,17 +302,17 @@ struct GridwiseReduction_mk_to_m_threadwise
         index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();

-        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<
-            InDataType,
-            AccDataType,
-            InGridDesc_M_K,
-            decltype(thread_buffer_desc),
-            ThreadBufferLengths,
-            typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
-            InSrcVectorDim,
-            InSrcVectorSize,
-            1,
-            false>(in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0));
+        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<InDataType,
+                                                                    AccDataType,
+                                                                    InGridDesc_M_K,
+                                                                    decltype(thread_buffer_desc),
+                                                                    ThreadBufferLengths,
+                                                                    ThreadBufferDimAccessOrder,
+                                                                    InSrcVectorDim,
+                                                                    InSrcVectorSize,
+                                                                    1,
+                                                                    false>(
+            in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0));

         constexpr auto in_thread_copy_step = make_multi_index(0, KThreadSliceSize);
...
...
@@ -57,7 +57,7 @@ template <typename InDataType,
           typename AccDataType,
           typename OutDataType,
           int Rank,
-          typename ReduceDims,
+          int NumReduceDim,
           ReduceTensorOp_t ReduceOpId,
           NanPropagation_t NanOpt,
           ReduceTensorIndices_t IndicesOpt>
...
@@ -91,7 +91,7 @@ void add_device_reduce_instance_blockwise(
                             AccDataType,
                             OutDataType,
                             Rank,
-                            ReduceDims,
+                            NumReduceDim,
                             ReduceOperation,
                             InElementwiseOperation,
                             AccElementwiseOperation,
...
@@ -112,34 +112,36 @@ void add_device_reduce_instance_blockwise(
     });
 };

-#define ADD_BLOCKWISE_INST_BY_TYPE(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \
-    template void add_device_reduce_instance_blockwise<inT,                                     \
-                                                       compT,                                   \
-                                                       outT,                                    \
-                                                       Rank,                                    \
-                                                       Sequence<__VA_ARGS__>,                   \
-                                                       ReduceOpId,                              \
-                                                       NanOpt,                                  \
-                                                       IndicesOpt>(                             \
+#define ADD_BLOCKWISE_INST_BY_TYPE(                                                             \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)                       \
+    template void add_device_reduce_instance_blockwise<inT,                                     \
+                                                       compT,                                   \
+                                                       outT,                                    \
+                                                       Rank,                                    \
+                                                       NumReduceDim,                            \
+                                                       ReduceOpId,                              \
+                                                       NanOpt,                                  \
+                                                       IndicesOpt>(                             \
         std::vector<deviceReduceBlockWisePtrType<compT, ReduceOpId>> & device_op_instances)

-#define ADD_BLOCKWISE_INST_BY_ID(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...)   \
-    ADD_BLOCKWISE_INST_BY_TYPE(inT,                                                             \
-                               compT,                                                           \
-                               outT,                                                            \
-                               static_cast<ReduceTensorOp_t>(ReduceOpId),                       \
-                               static_cast<NanPropagation_t>(NanOpt),                           \
-                               static_cast<ReduceTensorIndices_t>(IndicesOpt),                  \
-                               Rank,                                                            \
-                               __VA_ARGS__)
+#define ADD_BLOCKWISE_INST_BY_ID(                                                               \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)                       \
+    ADD_BLOCKWISE_INST_BY_TYPE(inT,                                                             \
+                               compT,                                                           \
+                               outT,                                                            \
+                               static_cast<ReduceTensorOp_t>(ReduceOpId),                       \
+                               static_cast<NanPropagation_t>(NanOpt),                           \
+                               static_cast<ReduceTensorIndices_t>(IndicesOpt),                  \
+                               Rank,                                                            \
+                               NumReduceDim)

 #define ADD_BLOCKWISE_INST_REF_BY_TYPE(                                                         \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...)                                \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)                       \
     extern template void add_device_reduce_instance_blockwise<inT,                              \
                                                               compT,                            \
                                                               outT,                             \
                                                               Rank,                             \
-                                                              Sequence<__VA_ARGS__>,            \
+                                                              NumReduceDim,                     \
                                                               ReduceOpId,                       \
                                                               NanOpt,                           \
                                                               IndicesOpt>(                      \
...
@@ -149,15 +151,16 @@ void add_device_reduce_instance_blockwise(
                                      AccElementwiseOperation>> &                                \
         device_op_instances)

-#define ADD_BLOCKWISE_INST_REF_BY_ID(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \
-    ADD_BLOCKWISE_INST_REF_BY_TYPE(inT,                                                           \
-                                   compT,                                                         \
-                                   outT,                                                          \
-                                   static_cast<ReduceTensorOp_t>(ReduceOpId),                     \
-                                   static_cast<NanPropagation_t>(NanOpt),                         \
-                                   static_cast<ReduceTensorIndices_t>(IndicesOpt),                \
-                                   Rank,                                                          \
-                                   __VA_ARGS__)
+#define ADD_BLOCKWISE_INST_REF_BY_ID(                                                             \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)                         \
+    ADD_BLOCKWISE_INST_REF_BY_TYPE(inT,                                                           \
+                                   compT,                                                         \
+                                   outT,                                                          \
+                                   static_cast<ReduceTensorOp_t>(ReduceOpId),                     \
+                                   static_cast<NanPropagation_t>(NanOpt),                         \
+                                   static_cast<ReduceTensorIndices_t>(IndicesOpt),                \
+                                   Rank,                                                          \
+                                   NumReduceDim)

 } // namespace device_reduce_instance
 } // namespace device
...
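
The instance lists below show what the macro rework buys: the precompiled device instances are now keyed by the number of reduced dimensions rather than by a specific `Sequence<...>` of dimension indices. A rough sketch of the normalization idea, under the assumption (suggested by the `ReduceDims` to `NumReduceDim` change) that only the count matters to the compiled kernel; `normalize` is a hypothetical helper, not the device-layer code:

```cpp
#include <cstdio>
#include <vector>

// Any concrete reduce-dimension list of a rank-R problem normalizes to an
// (R - n)-dimensional invariant part and an n-dimensional reduced part, so
// different 3-of-4 dimension choices would map to the same (Rank = 4,
// NumReduceDim = 3) instance under this assumption.
struct Problem2d
{
    int num_invariant_dim;
    int num_reduce_dim;
};

Problem2d normalize(int rank, const std::vector<int>& reduce_dims)
{
    const int n = static_cast<int>(reduce_dims.size());
    return {rank - n, n};
}

int main()
{
    const std::vector<std::vector<int>> choices = {{0, 1, 2}, {1, 2, 3}, {0, 2, 3}};
    for(const auto& dims : choices)
    {
        const auto p = normalize(4, dims); // Rank = 4
        std::printf("rank 4, %zu-dim reduce set -> NumReduceDim=%d, invariant dims=%d\n",
                    dims.size(), p.num_reduce_dim, p.num_invariant_dim);
    }
    return 0;
}
```

This is consistent with the tables that follow, where the old `..., 4, 0, 1, 2` entries become `..., 4, 3` and the old `..., 4, 0` entries become `..., 4, 1`.
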
...
@@ -11,25 +11,25 @@ namespace device {
 namespace device_reduce_instance {

 // clang-format off
-// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0, 1, 2); // for MIN
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0); //
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); //
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0, 1, 2); // for MAX
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0); //
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); //
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0, 1, 2); // for AMAX
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0); //
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); //
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0, 1, 2); // for MIN
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0); //
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); //
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0, 1, 2); // for MAX
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0); //
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); //
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0, 1, 2); // for AMAX
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0); //
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); //
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 // clang-format on

 } // namespace device_reduce_instance
...
...
@@ -11,16 +11,16 @@ namespace device {
 namespace device_reduce_instance {

 // clang-format off
-// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0, 1, 2); // for ADD
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0);
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0, 1, 2); // for AVG
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0); //
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); //
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0, 1, 2); // for NORM2
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0); //
-ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); //
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
 // clang-format on

 } // namespace device_reduce_instance
...
...
@@ -11,34 +11,34 @@ namespace device {
 namespace device_reduce_instance {

 // clang-format off
-// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 0);
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1);
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 0); //
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1); //
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 0); //
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1); //
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 0, 1, 2); // for MIN
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 0); //
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1); //
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 0, 1, 2); // for MAX
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 0); //
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1); //
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 0, 1, 2); // for AMAX
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 0); //
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1); //
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 0, 1, 2); // for MIN
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 0); //
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1); //
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 0, 1, 2); // for MAX
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 0); //
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1); //
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 0, 1, 2); // for AMAX
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 0); //
-ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1); //
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 // clang-format on

 } // namespace device_reduce_instance
...
...
@@ -11,16 +11,16 @@ namespace device {
 namespace device_reduce_instance {

 // clang-format off
-// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
-ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
-ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 0);
-ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 2, 1);
-ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
-ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 0); //
-ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1); //
-ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
-ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 0); //
-ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1); //
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1);
 // clang-format on

 } // namespace device_reduce_instance
...