Unverified Commit e17c0d80 authored by Qianfeng's avatar Qianfeng Committed by GitHub
Browse files

Reduction in Composable Kernel (#82)



* Initial adding of generic reduction

* Initial adding of generic reduction ...

* Updates to make compiling done

* clang-format all files

* clang-format some files again

* Renaming in profiler/include/profile_reduce.hpp

* Updates and make BlockWise cases passed

* Updates and make ThreadWise and MultiBlockTwoCall cases passed

* Remove the support for MUL and NORM1 reduceOp from the profiler and the device instances

* Change to replace the dim0_max_vector_size/dim1_max_vector_size template argument in the device reduce classes

* format

* adding pooling

* added max and average pooling

* comment out cout and kernel timing

* Tiny simplification in profiler/reduce_profiler.cpp

* Add example for reduce_blockwise

* Tiny updates

* Change to pass the ElementWiseOp from device layer to kernel

* Fix the vectorDim and vectorSize in Device layer

* Enable vector load on both dim0 and dim1 for Threadwise method

* Tiny updates

* Change to let the user to pass the preUnaryOp and posUnaryOp

* Make pooling example work

* split device_reduce_instance into two libraries

* Tiny update

* Replace nanPropaOpt enum by boolean propagate_nan

* Simplification in DeviceReduce layer codes

* update build

* Change to clarify the difference between ck::half_t and half_float::half

* Renaming in all the reduction codes

* Add VectorSize as template parameter for device layer

* Add BetaIsZero as kernel template and as AccDataType for alpha

* print

* Small updates for pooling

* Updates for host_generic_reduction for reference

* Update to make AVG pooling pass

* Update to make MAX pooling with indices output pass

* fix

* add OutDst vector store to threadwise reduction and pooling

* tweak

* turn off check_indices that caused build issue

* refactor pooling

* clean up

* turn off check_indices for building issue for php-compiler

* add more tile size for odd C

* tweak conv for odd C

* update script

* clean up elementwise op

* add hack in reduction_operator.hpp to avoid compile error. To fix it, need to use element_wise_op in reduction op

* Add OutVectorSize as device and kernel tunable, also update to Elementwise Operations

* Move reduce operator mapping to host layer file reduction_operator_mapping.hpp from reduction_operator.hpp

* Change to the unary operators

* Move the definitions of unary operations to element_wise_operation.hpp

* re-org files

* Refine in device interfaces and multiblock kernels

* Split the reduction configurations into instances for specific methods

* Update in getTypeString() of device pool2d

* Renaming in host and kernel

* Tiny update in profiler/src/profiler.cpp

* Uncomment in device_operation/CMakeLists.txt to enable the building of all operations

* Make check_indices a templated function to remove some linking issue

* Renaming in the profiler reduce module

* Add support for double Reduction (but disable MultiblockAtomicAdd for double)

* Tiny correction of literal string

* Rename DevicePoolFwd to DevicePool2dFwd

* Split device_reduce_instance_xxx.cpp files according to the data types to speed up compiling

* Add comments for lists of configurations, lists of instances and references of add_reduce_instances_xxx

* Remove un-used header file gridwise_generic_reduction_wrapper_common.hpp

* Renaming and refining in the Reduction codes

* Tiny change in the unary operators

* Renaming symbols and files

* Renaming symbols in the kernels

* Move kernel kernel_set_buffer_value to separate file

* Add IndexDataType template parameter for kernels and use int32_t as index data type in device layer

* Tiny update in the kernels

* Remove definition of sqrtf()/isnan()/abs() for half_t due to some ADL issue

* Simplify a helper function in device layer

* Tiny adjustment in testing data initialization

* Renaming in kernel/device/host

* Add two testing scripts for reduction

* Refine the Unary operators in element_wise_operation.hpp

* Update in the reduce profiler module

* Update to the reduction testing scripts

* reduce compile parallelism

* change CI docker to rocm5.0

* remove unused variables

* fix build
Co-authored-by: default avatarChao Liu <chao.liu2@amd.com>
parent 12dfba3d
FROM ubuntu:18.04
ARG ROCMVERSION=4.3.1
ARG ROCMVERSION=5.0
ARG OSDB_BKC_VERSION
RUN set -xe
......
......@@ -175,6 +175,161 @@ struct RequantReluRequant
float scaleRelu_;
};
// Unary operators are usually called element-wisely before/after the reduction is executed on the
// elements. They are needed for easy implementation of reduction types of AVG, NRM1, NRM2
template <typename Y, typename X, bool HasDividing = false>
struct UnaryIdentic;
template <>
struct UnaryIdentic<float, float, false>
{
__host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(float& y, const float& x) const { y = x; };
};
template <>
struct UnaryIdentic<float, float, true>
{
__host__ __device__ UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
__host__ __device__ void operator()(float& y, const float& x) const
{
y = x / type_convert<float>(divider_);
};
int32_t divider_ = 1;
};
template <>
struct UnaryIdentic<half_t, half_t, false>
{
__host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(half_t& y, const half_t& x) const { y = x; };
};
template <>
struct UnaryIdentic<double, double, false>
{
__host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(double& y, const double& x) const { y = x; };
};
template <>
struct UnaryIdentic<double, double, true>
{
__host__ __device__ UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
__host__ __device__ void operator()(double& y, const double& x) const
{
y = x / type_convert<double>(divider_);
};
int32_t divider_ = 1;
};
template <>
struct UnaryIdentic<int32_t, int32_t, false>
{
__host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x; };
};
template <typename Y, typename X, bool HasDividing = false>
struct UnarySquare;
template <>
struct UnarySquare<float, float, false>
{
__host__ __device__ UnarySquare(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(float& y, const float& x) const { y = x * x; };
};
template <>
struct UnarySquare<float, float, true>
{
__host__ __device__ UnarySquare(const int32_t divider = 1) { divider_ = divider; };
__host__ __device__ void operator()(float& y, const float& x) const
{
y = x * x / type_convert<float>(divider_);
};
int32_t divider_ = 1;
};
template <>
struct UnarySquare<double, double, false>
{
__host__ __device__ UnarySquare(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(double& y, const double& x) const { y = x * x; };
};
template <>
struct UnarySquare<double, double, true>
{
__host__ __device__ UnarySquare(const int32_t divider = 1) { divider_ = divider; };
__host__ __device__ void operator()(double& y, const double& x) const
{
y = x * x / type_convert<double>(divider_);
};
int32_t divider_ = 1;
};
template <typename Y, typename X>
struct UnaryAbs;
template <>
struct UnaryAbs<float, float>
{
__host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(float& y, const float& x) const { y = abs(x); };
};
template <>
struct UnaryAbs<half_t, half_t>
{
__host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(half_t& y, const half_t& x) const { y = __habs(x); };
};
template <>
struct UnaryAbs<double, double>
{
__host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(double& y, const double& x) const { y = abs(x); };
};
template <typename Y, typename X>
struct UnarySqrt;
template <>
struct UnarySqrt<float, float>
{
__host__ __device__ UnarySqrt(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(float& y, const float& x) const { y = sqrtf(x); };
};
template <>
struct UnarySqrt<double, double>
{
__host__ __device__ UnarySqrt(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(double& y, const double& x) const { y = sqrt(x); };
};
} // namespace element_wise
} // namespace tensor_operation
} // namespace ck
......
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_ATOMIC_ADD_HPP
#define CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_ATOMIC_ADD_HPP
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_accumulate.hpp"
#include "reduction_functions_blockwise.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
namespace ck {
template <typename GridwiseReduction,
typename InDataType,
typename OutDataType,
typename AccDataType,
typename InGridDesc_M_K,
typename OutGridDesc_M,
typename InElementwiseOperation,
typename AccElementwiseOperation>
__global__ void
kernel_reduce_multiblock_atocmi_add(const InGridDesc_M_K in_grid_desc_m_k,
const OutGridDesc_M out_grid_desc_m,
const InElementwiseOperation in_elementwise_op,
const AccElementwiseOperation acc_elementwise_op,
index_t block_group_size,
index_t num_k_block_tile_iteration,
AccDataType alpha,
const InDataType* const __restrict__ p_in_global,
OutDataType* const __restrict__ p_out_global)
{
GridwiseReduction::Run(in_grid_desc_m_k,
out_grid_desc_m,
in_elementwise_op,
acc_elementwise_op,
block_group_size,
num_k_block_tile_iteration,
alpha,
p_in_global,
p_out_global);
};
template <typename InDataType,
typename OutDataType,
typename AccDataType,
typename InGridDesc_M_K,
typename OutGridDesc_M,
typename ReduceOperation,
typename InElementwiseOperation,
typename AccElementwiseOperation,
bool PropagateNan,
index_t BlockSize,
index_t MThreadClusterSize,
index_t KThreadClusterSize,
index_t MThreadSliceSize,
index_t KThreadSliceSize,
index_t InSrcVectorDim,
index_t InSrcVectorSize,
index_t OutDstVectorSize>
struct GridwiseReduction_mk_to_m_multiblock_atomic_add
{
static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0);
static constexpr auto buffer_1d_desc =
make_naive_tensor_descriptor_packed(make_tuple(Number<BlockSize>{}));
using blockwise_reduce = PartitionedBlockwiseReductionOn1dBuffer<decltype(buffer_1d_desc),
AccDataType,
BlockSize,
MThreadClusterSize,
KThreadClusterSize,
reorder_thread_cluster,
ReduceOperation,
PropagateNan>;
template <typename T>
using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<T, T>;
static constexpr auto I0 = Number<0>{};
static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
using Accumulation = detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;
__device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k,
const OutGridDesc_M& out_grid_desc_m,
const InElementwiseOperation& in_elementwise_op,
const AccElementwiseOperation& acc_elementwise_op,
index_t block_group_size,
index_t num_k_block_tile_iteration,
AccDataType alpha,
const InDataType* const __restrict__ p_in_global,
OutDataType* const __restrict__ p_out_global)
{
const auto zeroVal = ReduceOperation::GetReductionZeroVal();
// LDS
__shared__ AccDataType p_block_reduce_buffer[BlockSize];
const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert<InDataType>(zeroVal));
auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_out_global, out_grid_desc_m.GetElementSpaceSize());
auto block_reduce_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_block_reduce_buffer, BlockSize);
StaticBuffer<AddressSpaceEnum_t::Vgpr,
AccDataType,
MThreadSliceSize * KThreadSliceSize,
true>
in_thread_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;
static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; });
const index_t thread_local_id = get_thread_local_1d_id();
const index_t block_global_id = get_block_1d_id();
const index_t blkgroup_id = block_global_id / block_group_size;
const index_t block_local_id = block_global_id % block_group_size;
const index_t thread_m_cluster_id =
reorder_thread_cluster ? thread_local_id % MThreadClusterSize
: ((thread_local_id / KThreadClusterSize) % MThreadClusterSize);
const index_t thread_k_cluster_id =
reorder_thread_cluster ? ((thread_local_id / MThreadClusterSize) % KThreadClusterSize)
: thread_local_id % KThreadClusterSize;
const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration;
using ThreadBufferLengths = Sequence<MThreadSliceSize, KThreadSliceSize>;
constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));
auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<
InDataType,
AccDataType,
InGridDesc_M_K,
decltype(thread_buffer_desc),
ThreadBufferLengths,
typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
InSrcVectorDim,
InSrcVectorSize,
1,
false>(
in_grid_desc_m_k,
make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize,
block_local_id * reduceSizePerBlock +
thread_k_cluster_id * KThreadSliceSize));
constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize);
index_t reducedTiles = 0;
do
{
threadwise_src_load.Run(in_grid_desc_m_k,
in_global_buf,
thread_buffer_desc,
make_tuple(I0, I0),
in_thread_buf);
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
// do element-wise pre-reduction operation
static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
in_elementwise_op(in_thread_buf(offset), in_thread_buf(offset));
});
// reduce on each thread-local slice
static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
Accumulation::Calculate(accu_value_buf(I), in_thread_buf[offset]);
});
});
threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);
reducedTiles++;
} while(reducedTiles < num_k_block_tile_iteration);
constexpr auto reduced_data_desc =
make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));
// Each block executes multiple parallel reductions on the LDS, and by atomic-adding its
// reduced output to the global location corresponding to each invariant dimension to get a
// consistent reduced result for that invariant dimension. due to the using of vector_load,
// each block/thread is involved into multiple invarirant dimensions.
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
if constexpr(reorder_thread_cluster)
{
block_reduce_buf(thread_k_cluster_id * MThreadClusterSize + thread_m_cluster_id) =
accu_value_buf[I];
}
else
block_reduce_buf(thread_m_cluster_id * KThreadClusterSize + thread_k_cluster_id) =
accu_value_buf[I];
accu_value_buf(I) = zeroVal;
__syncthreads();
blockwise_reduce::Reduce(
block_reduce_buf, accu_value_buf(I), thread_m_cluster_id, thread_k_cluster_id);
});
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
if(thread_k_cluster_id == 0)
{
acc_elementwise_op(accu_value_buf(I), accu_value_buf(I));
accu_value_buf(I) *= alpha;
}
});
if(thread_k_cluster_id == 0)
{
auto threadwise_dst_store =
ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
OutDataType,
decltype(reduced_data_desc),
OutGridDesc_M,
PassThroughOp<AccDataType>,
Sequence<MThreadSliceSize>,
Sequence<0>,
0,
OutDstVectorSize,
InMemoryDataOperationEnum_t::AtomicAdd,
1,
true>(
out_grid_desc_m,
make_multi_index(blkgroup_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize),
PassThroughOp<AccDataType>{});
threadwise_dst_store.Run(
reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, out_global_buf);
}
};
};
} // namespace ck
#endif
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_MULTIBLOCK_HPP
#define CK_GRIDWISE_GENERIC_2D_REDUCTION_MULTIBLOCK_HPP
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_blockwise.hpp"
#include "blockwise_tensor_slice_transfer.hpp"
namespace ck {
template <index_t BlockSize,
typename srcDataType,
typename dstDataType, // not used together with the beta input
typename compType,
typename src2dDescType,
typename dst1dDescType,
ReduceTensorOp_t op,
NanPropagation_t nanPropaOpt,
ReduceTensorIndices_t reduceIndicesOpt,
index_t GredAccessesPerThreadInBlock>
struct GridwiseReduction_xy_to_x_multiblock
{
using opReduce = typename reduce_binary_operator<compType, op>::opType;
using preUnaryOpType = typename reduce_unary_operator<compType, op, true, false>::preUnaryOp;
using posUnaryOpType = typename reduce_unary_operator<compType, op, true, false>::posUnaryOp;
static constexpr auto buffer2dDesc = make_naive_tensor_descriptor_packed(
make_tuple(Number<GredAccessesPerThreadInBlock>{}, Number<BlockSize>{}));
using blockwise_reduce =
BlockwiseReduction_2d_block_buffer<decltype(buffer2dDesc), true, opReduce, nanPropaOpt>;
static constexpr index_t BlockBufferSize = buffer2dDesc.GetElementSize();
static constexpr auto I0 = Number<0>{};
template <int RunId>
__device__ static void Run(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
int BlkGroupSize,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
srcDataType* const __restrict__ ws_values_global,
int* const __restrict__ ws_indices_global);
template <>
__device__ static void Run<1>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
int BlkGroupSize,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
srcDataType* const __restrict__ ws_values_global,
int* const __restrict__ ws_indices_global)
{
(void)ws_indices_global;
(void)alpha; // unused
(void)beta; // unused
const auto zeroVal = opReduce::GetReductionZeroVal();
// LDS
__shared__ compType p_in_block_buffer[BlockBufferSize];
const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
auto workspace_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
ws_values_global, dst1dDesc.GetLength(I0) * BlkGroupSize);
auto in_block_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_buffer, BlockBufferSize);
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
accuValue_buf(I0) = zeroVal;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
const int divider = origReduceLen;
const preUnaryOpType preUnaryOp(divider);
const index_t thread_local_id = get_thread_local_1d_id();
const index_t block_global_id = get_block_1d_id();
const index_t blkgroup_id = block_global_id / BlkGroupSize;
const index_t block_local_id = block_global_id % BlkGroupSize;
const index_t reduceSizePerBlock =
(((toReduceLength + BlkGroupSize - 1) / BlkGroupSize + BlockBufferSize - 1) /
BlockBufferSize) *
BlockBufferSize;
constexpr auto in_block_desc = make_naive_tensor_descriptor_packed(
make_tuple(Number<1>{}, Number<BlockSize * GredAccessesPerThreadInBlock>{}));
using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>;
using ThreadClusterLengths = Sequence<1, BlockSize>;
auto blockwise_src_load = BlockwiseTensorSliceTransfer_v4<BlockSize,
InMemoryDataOperationEnum_t::Set,
Sequence<1, BlockBufferSize>,
ThreadSliceLengths,
ThreadClusterLengths,
Sequence<0, 1>,
srcDataType,
compType,
src2dDescType,
decltype(in_block_desc),
Sequence<0, 1>,
Sequence<0, 1>,
1,
1,
1,
1,
1,
1,
false,
true>(
src2dDesc,
make_multi_index(blkgroup_id, block_local_id * reduceSizePerBlock),
in_block_desc,
make_multi_index(0, 0));
constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);
const index_t toReduceBlocks = (reduceSizePerBlock + BlockSize - 1) / BlockSize;
for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
reducedBlocks += GredAccessesPerThreadInBlock)
{
blockwise_src_load.RunRead(src2dDesc, src_global_buf);
blockwise_src_load.RunWrite(in_block_desc, in_block_buf);
__syncthreads();
// do element-wise pre-reduction operation
blockwise_reduce::operate_on_elements(preUnaryOp, in_block_buf);
index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
? GredAccessesPerThreadInBlock
: toReduceBlocks - reducedBlocks;
blockwise_reduce::Reduce(in_block_buf, BlocksInOneOp, accuValue_buf(I0));
blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
}
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
const auto workspace_desc =
make_naive_tensor_descriptor_packed(make_tuple(dst1dDesc.GetLength(I0) * BlkGroupSize));
// The first thread in the block stores the reduced result to the global location
// representing the block
if(thread_local_id == 0)
{
auto threadwise_workspace_store =
ThreadwiseTensorSliceTransfer_v1r3<compType,
srcDataType,
decltype(ReducedDataDesc),
decltype(workspace_desc),
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(workspace_desc,
make_multi_index(block_global_id));
threadwise_workspace_store.Run(ReducedDataDesc,
make_tuple(I0),
accuValue_buf,
workspace_desc,
workspace_global_buf);
}
};
template <>
__device__ static void Run<2>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
int BlkGroupSize,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
srcDataType* const __restrict__ ws_values_global,
int* const __restrict__ ws_indices_global)
{
(void)alpha; // unused
(void)beta; // unused
const auto zeroVal = opReduce::GetReductionZeroVal();
// LDS
__shared__ compType p_in_block_values_buffer[BlockBufferSize];
__shared__ int p_in_block_indices_buffer[BlockBufferSize];
const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
auto workspace_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
ws_values_global, dst1dDesc.GetLength(I0) * BlkGroupSize);
auto workspace_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
ws_indices_global, dst1dDesc.GetLength(I0) * BlkGroupSize);
auto in_block_val_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_values_buffer, BlockBufferSize);
auto in_block_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
p_in_block_indices_buffer, BlockBufferSize);
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
accuValue_buf(I0) = zeroVal;
accuIndex_buf(I0) = 0;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
const int divider = origReduceLen;
const preUnaryOpType preUnaryOp(divider);
const index_t thread_local_id = get_thread_local_1d_id();
const index_t block_global_id = get_block_1d_id();
const index_t blkgroup_id = block_global_id / BlkGroupSize;
const index_t block_local_id = block_global_id % BlkGroupSize;
const index_t reduceSizePerBlock =
(((toReduceLength + BlkGroupSize - 1) / BlkGroupSize + BlockBufferSize - 1) /
BlockBufferSize) *
BlockBufferSize;
constexpr auto in_block_desc = make_naive_tensor_descriptor_packed(
make_tuple(Number<1>{}, Number<BlockSize * GredAccessesPerThreadInBlock>{}));
using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>;
using ThreadClusterLengths = Sequence<1, BlockSize>;
auto blockwise_src_load = BlockwiseTensorSliceTransfer_v4<BlockSize,
InMemoryDataOperationEnum_t::Set,
Sequence<1, BlockBufferSize>,
ThreadSliceLengths,
ThreadClusterLengths,
Sequence<0, 1>,
srcDataType,
compType,
src2dDescType,
decltype(in_block_desc),
Sequence<0, 1>,
Sequence<0, 1>,
1,
1,
1,
1,
1,
1,
false,
true>(
src2dDesc,
make_multi_index(blkgroup_id, block_local_id * reduceSizePerBlock),
in_block_desc,
make_multi_index(0, 0));
constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);
const index_t toReduceBlocks = (reduceSizePerBlock + BlockSize - 1) / BlockSize;
int indexOffset = block_local_id * reduceSizePerBlock;
for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
reducedBlocks += GredAccessesPerThreadInBlock)
{
blockwise_reduce::init_buffer_indices(in_block_idx_buf, indexOffset);
blockwise_src_load.RunRead(src2dDesc, src_global_buf);
blockwise_src_load.RunWrite(in_block_desc, in_block_val_buf);
__syncthreads();
// unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually
// done here
blockwise_reduce::operate_on_elements(preUnaryOp, in_block_val_buf);
index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
? GredAccessesPerThreadInBlock
: toReduceBlocks - reducedBlocks;
blockwise_reduce::Reduce2(in_block_val_buf,
in_block_idx_buf,
BlocksInOneOp,
accuValue_buf(I0),
accuIndex_buf(I0));
indexOffset += BlockBufferSize;
blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
}
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
const auto workspace_desc =
make_naive_tensor_descriptor_packed(make_tuple(dst1dDesc.GetLength(I0) * BlkGroupSize));
// The first thread in the block stores the reduced result to the global location
// representing the block
if(thread_local_id == 0)
{
auto threadwise_workspace_val_store =
ThreadwiseTensorSliceTransfer_v1r3<compType,
srcDataType,
decltype(ReducedDataDesc),
decltype(workspace_desc),
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(workspace_desc,
make_multi_index(block_global_id));
auto threadwise_workspace_idx_store =
ThreadwiseTensorSliceTransfer_v1r3<int,
int,
decltype(ReducedDataDesc),
decltype(workspace_desc),
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(workspace_desc,
make_multi_index(block_global_id));
threadwise_workspace_val_store.Run(ReducedDataDesc,
make_tuple(I0),
accuValue_buf,
workspace_desc,
workspace_global_val_buf);
threadwise_workspace_idx_store.Run(ReducedDataDesc,
make_tuple(I0),
accuIndex_buf,
workspace_desc,
workspace_global_idx_buf);
}
};
};
} // namespace ck
#endif
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_SET_BUFFER_VALUE_HPP
#define CK_GRIDWISE_SET_BUFFER_VALUE_HPP
#include "threadwise_tensor_slice_transfer.hpp"
namespace ck {
template <index_t BlockSize, typename DataType, typename Grid1dBufferDescType>
__global__ void kernel_buffer_set_value(const Grid1dBufferDescType grid_1d_buffer_desc,
DataType* const __restrict__ p_global,
DataType value)
{
using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<DataType, DataType>;
constexpr auto I0 = Number<0>{};
const index_t thread_local_id = get_thread_local_1d_id();
const index_t block_global_id = get_block_1d_id();
const index_t thread_global_id = block_global_id * BlockSize + thread_local_id;
StaticBuffer<AddressSpaceEnum_t::Vgpr, DataType, 1, true> value_buf;
value_buf(I0) = value;
constexpr auto val_buff_desc = make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
auto global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_global, grid_1d_buffer_desc.GetElementSpaceSize());
if(thread_global_id < grid_1d_buffer_desc.GetElementSize())
{
auto threadwise_store = ThreadwiseTensorSliceTransfer_v1r3<DataType,
DataType,
decltype(val_buff_desc),
Grid1dBufferDescType,
PassThroughOp,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(
grid_1d_buffer_desc, make_multi_index(thread_global_id), PassThroughOp{});
threadwise_store.Run(
val_buff_desc, make_tuple(I0), value_buf, grid_1d_buffer_desc, global_buf);
}
};
} // namespace ck
#endif
......@@ -30,240 +30,154 @@
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_binop.hpp"
#include "reduction_functions_accumulate.hpp"
namespace ck {
template <typename buffer2dDescType,
bool blockIsOneRow,
typename opReduce,
NanPropagation_t nanPropaOpt>
struct BlockwiseReduction_2d_block_buffer
template <typename Buffer1dDescType,
typename AccDataType,
index_t BlockSize,
index_t MThreadClusterSize,
index_t KThreadClusterSize,
bool ReorderThreadClusters,
typename OpReduce,
bool PropagateNan>
struct PartitionedBlockwiseReductionOn1dBuffer
{
using compType = typename opReduce::dataType;
static constexpr auto buffer_1d_desc = Buffer1dDescType{};
static constexpr auto buffer2dDesc = buffer2dDescType{};
static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
"The product of cluster lengths should be same as BlockSize!");
static_assert(KThreadClusterSize > 1, "Parallel reduction need work on at least two elements");
static constexpr index_t BlockSize =
blockIsOneRow ? buffer2dDesc.GetLength(Number<1>{}) : buffer2dDesc.GetLength(Number<0>{});
static constexpr index_t NumBlocks =
blockIsOneRow ? buffer2dDesc.GetLength(Number<0>{}) : buffer2dDesc.GetLength(Number<1>{});
using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
static_assert(buffer_1d_desc.GetElementSize() == BlockSize,
"The buffer size should be the same as BlockSize!");
// This interface does not accumulate on indices
template <typename BufferType>
__device__ static void
Reduce(BufferType& block_buffer, index_t toReduceBlocks, compType& accuData)
{
const index_t thread_local_id = get_thread_local_1d_id();
compType lAccuData = opReduce::GetReductionZeroVal();
using Accumulation = detail::AccumulateWithNanCheck<PropagateNan, OpReduce, AccDataType>;
index_t offset;
for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
template <typename BufferType>
__device__ static void Reduce(BufferType& block_buffer,
AccDataType& accuData,
index_t thread_m_cluster_id,
index_t thread_k_cluster_id)
{
offset = blockIsOneRow
? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_local_id))
: buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, otherDimInd));
compType opData = type_convert<compType>(block_buffer[offset]);
binop::calculate(lAccuData, opData);
}
offset = blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id))
: buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));
constexpr auto cluster_len_shift = get_shift<KThreadClusterSize>();
block_buffer(offset) = lAccuData;
static_for<0, cluster_len_shift, 1>{}([&](auto I) {
constexpr index_t indOffset = 1 << (cluster_len_shift - 1 - I());
__syncthreads();
for(index_t indOffset = BlockSize / 2; indOffset > 0; indOffset /= 2)
{
if(thread_local_id < indOffset)
if(thread_k_cluster_id < indOffset)
{
// consider the thread clusters order, ensure the contiguous locations are accessed
// by contiguous Thread-ID
index_t offset1 =
blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id))
: buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));
index_t offset2 =
blockIsOneRow
? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id + indOffset))
: buffer2dDesc.CalculateOffset(make_tuple(thread_local_id + indOffset, 0));
compType opData1 = type_convert<compType>(block_buffer[offset1]);
compType opData2 = type_convert<compType>(block_buffer[offset2]);
binop::calculate(opData1, opData2);
block_buffer(offset1) = type_convert<compType>(opData1);
ReorderThreadClusters
? buffer_1d_desc.CalculateOffset(make_tuple(
thread_k_cluster_id * MThreadClusterSize + thread_m_cluster_id))
: buffer_1d_desc.CalculateOffset(make_tuple(
thread_m_cluster_id * KThreadClusterSize + thread_k_cluster_id));
index_t offset2 = ReorderThreadClusters
? buffer_1d_desc.CalculateOffset(make_tuple(
(thread_k_cluster_id + indOffset) * MThreadClusterSize +
thread_m_cluster_id))
: buffer_1d_desc.CalculateOffset(
make_tuple(thread_m_cluster_id * KThreadClusterSize +
(thread_k_cluster_id + indOffset)));
AccDataType opData1 = type_convert<AccDataType>(block_buffer[offset1]);
AccDataType opData2 = type_convert<AccDataType>(block_buffer[offset2]);
Accumulation::Calculate(opData1, opData2);
block_buffer(offset1) = type_convert<AccDataType>(opData1);
}
__syncthreads();
}
});
if(thread_local_id == 0)
{
compType tmpVal = type_convert<compType>(block_buffer[0]);
index_t offset = ReorderThreadClusters
? buffer_1d_desc.CalculateOffset(make_tuple(thread_m_cluster_id))
: buffer_1d_desc.CalculateOffset(
make_tuple(thread_m_cluster_id * KThreadClusterSize));
binop::calculate(accuData, tmpVal);
}
accuData = type_convert<AccDataType>(block_buffer[offset]);
};
};
// This interface accumulates on both data values and indices
template <typename BufferType, typename IdxBufferType>
__device__ static void Reduce2(BufferType& block_buffer,
IdxBufferType& block_indices_buffer,
index_t toReduceBlocks,
compType& accuData,
int& accuIndex)
{
const index_t thread_local_id = get_thread_local_1d_id();
compType lAccuData = opReduce::GetReductionZeroVal();
int lAccuIndex = 0;
if constexpr(blockIsOneRow)
{
for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
{
for(index_t indOffset = 1; indOffset < BlockSize; indOffset *= 2)
{
if(thread_local_id % (indOffset * 2) == 0)
{
index_t offset1 =
buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_local_id));
index_t offset2 = buffer2dDesc.CalculateOffset(
make_tuple(otherDimInd, thread_local_id + indOffset));
compType currVal1 = type_convert<compType>(block_buffer[offset1]);
compType currVal2 = type_convert<compType>(block_buffer[offset2]);
int currIndex1 = block_indices_buffer[offset1];
int currIndex2 = block_indices_buffer[offset2];
binop::calculate(currVal1, currVal2, currIndex1, currIndex2);
block_buffer(offset1) = type_convert<compType>(currVal1);
block_indices_buffer(offset1) = currIndex1;
}
__syncthreads();
}
}
if(thread_local_id == 0)
{
for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
{
index_t offset = buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, 0));
compType tmpVal = type_convert<compType>(block_buffer[offset]);
int tmpIndex = block_indices_buffer[offset];
binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex);
}
binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex);
}
}
else
{
index_t offset;
for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
{
offset = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, otherDimInd));
compType currVal = type_convert<compType>(block_buffer[offset]);
int currIndex = block_indices_buffer[offset];
binop::calculate(lAccuData, currVal, lAccuIndex, currIndex);
}
offset = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));
block_buffer(offset) = lAccuData;
block_indices_buffer(offset) = lAccuIndex;
__syncthreads();
for(index_t indOffset = 1; indOffset < BlockSize; indOffset *= 2)
{
if(thread_local_id % (indOffset * 2) == 0)
{
index_t offset1 = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));
index_t offset2 =
buffer2dDesc.CalculateOffset(make_tuple(thread_local_id + indOffset, 0));
template <typename Buffer1dDescType,
typename AccDataType,
typename IndexDataType,
index_t BlockSize,
index_t MThreadClusterSize,
index_t KThreadClusterSize,
bool ReorderThreadClusters,
typename OpReduce,
bool PropagateNan>
struct PartitionedBlockwiseReductionWithIndexOn1dBuffer
{
static constexpr auto buffer_1d_desc = Buffer1dDescType{};
compType currVal1 = type_convert<compType>(block_buffer[offset1]);
compType currVal2 = type_convert<compType>(block_buffer[offset2]);
int currIndex1 = block_indices_buffer[offset1];
int currIndex2 = block_indices_buffer[offset2];
static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
"The product of cluster lengths should be same as BlockSize!");
static_assert(KThreadClusterSize > 1, "Parallel reduction need work on at least two elements");
binop::calculate(currVal1, currVal2, currIndex1, currIndex2);
block_buffer(offset1) = type_convert<compType>(currVal1);
block_indices_buffer(offset1) = currIndex1;
}
static_assert(buffer_1d_desc.GetElementSize() == BlockSize,
"The buffer size should be the same as BlockSize!");
__syncthreads();
}
using Accumulation =
detail::AccumulateWithIndexAndNanCheck<PropagateNan, OpReduce, AccDataType, IndexDataType>;
if(thread_local_id == 0)
// This interface accumulates on both data values and indices
template <typename BufferType, typename IdxBufferType>
__device__ static void Reduce(BufferType& block_val_buffer,
IdxBufferType& block_idx_buffer,
AccDataType& accuData,
IndexDataType& accuIndex,
index_t thread_m_cluster_id,
index_t thread_k_cluster_id)
{
compType tmpVal = type_convert<compType>(block_buffer[0]);
int tmpIndex = block_indices_buffer[0];
binop::calculate(accuData, tmpVal, accuIndex, tmpIndex);
}
}
};
constexpr auto cluster_len_shift = get_shift<KThreadClusterSize>();
template <typename BufferType>
__device__ static void set_buffer_value(BufferType& block_buffer, compType value)
{
index_t thread_id = get_thread_local_1d_id();
static_for<0, cluster_len_shift, 1>{}([&](auto I) {
constexpr index_t indOffset = 1 << I();
for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++)
if(thread_k_cluster_id % (indOffset * 2) == 0)
{
index_t offset = blockIsOneRow
? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id))
: buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd));
block_buffer(offset) = value;
__syncthreads();
// consider the thread clusters order, ensure the contiguous locations are accessed
// by contiguous Thread-ID
index_t offset1 =
ReorderThreadClusters
? buffer_1d_desc.CalculateOffset(make_tuple(
thread_k_cluster_id * MThreadClusterSize + thread_m_cluster_id))
: buffer_1d_desc.CalculateOffset(make_tuple(
thread_m_cluster_id * KThreadClusterSize + thread_k_cluster_id));
index_t offset2 = ReorderThreadClusters
? buffer_1d_desc.CalculateOffset(make_tuple(
(thread_k_cluster_id + indOffset) * MThreadClusterSize +
thread_m_cluster_id))
: buffer_1d_desc.CalculateOffset(
make_tuple(thread_m_cluster_id * KThreadClusterSize +
(thread_k_cluster_id + indOffset)));
AccDataType opData1 = type_convert<AccDataType>(block_val_buffer[offset1]);
AccDataType opData2 = type_convert<AccDataType>(block_val_buffer[offset2]);
IndexDataType currIndex1 = block_idx_buffer[offset1];
IndexDataType currIndex2 = block_idx_buffer[offset2];
Accumulation::Calculate(opData1, opData2, currIndex1, currIndex2);
block_val_buffer(offset1) = type_convert<AccDataType>(opData1);
block_idx_buffer(offset1) = currIndex1;
}
};
// Initialize the block-wise indices buffer, the index for each element in the block-wise
// data buffer is calculated according to its position in the buffer and the global starting
// index
template <typename IdxBufferType>
__device__ static void init_buffer_indices(IdxBufferType& block_indices_buffer, int indexStart)
{
index_t thread_id = get_thread_local_1d_id();
for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++)
{
index_t offset = blockIsOneRow
? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id))
: buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd));
block_indices_buffer(offset) = offset + indexStart;
__syncthreads();
}
};
// Execute unary operation on the block buffer elements
template <typename unary_op_type, typename BufferType>
__device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& block_buffer)
{
index_t thread_id = get_thread_local_1d_id();
for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++)
{
index_t offset = blockIsOneRow
? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id))
: buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd));
});
block_buffer(offset) = unary_op(block_buffer[offset]);
index_t offset = ReorderThreadClusters
? buffer_1d_desc.CalculateOffset(make_tuple(thread_m_cluster_id))
: buffer_1d_desc.CalculateOffset(
make_tuple(thread_m_cluster_id * KThreadClusterSize));
__syncthreads();
accuData = type_convert<AccDataType>(block_val_buffer[offset]);
accuIndex = block_idx_buffer[offset];
}
};
};
}; // end of namespace ck
......
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_FUNCTIONS_THREADWISE_HPP
#define CK_REDUCTION_FUNCTIONS_THREADWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_binop.hpp"
namespace ck {
template <typename BufferType, typename opReduce, NanPropagation_t nanPropaOpt>
struct ThreadReduce
{
using compType = typename opReduce::dataType;
static_assert(BufferType::IsStaticBuffer(), "Thread-wise reduction needs use StaticBuffer!");
static_assert(
std::is_same<typename BufferType::type, compType>::value,
"Data type of StaticBuffer for Thread-wise reduction should be same as the compType!");
static constexpr index_t ThreadBufferLen = BufferType::Size();
using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
// This interface does not accumulate on indices
__device__ static void Reduce(const BufferType& thread_buffer, compType& accuData)
{
static_for<0, ThreadBufferLen, 1>{}(
[&](auto I) { binop::calculate(accuData, thread_buffer[I]); });
};
// This interface accumulates on both data values and indices and
// is called by Direct_ThreadWise reduction method at first-time reduction
__device__ static void
Reduce2(const BufferType& thread_buffer, compType& accuData, int& accuIndex, int indexStart)
{
static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
int currIndex = I + indexStart;
binop::calculate(accuData, thread_buffer[I], accuIndex, currIndex);
});
};
// Set the elements in the per-thread buffer to a specific value
// cppcheck-suppress constParameter
__device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
{
static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });
};
// Execute unary operation on the per-thread buffer elements
template <typename unary_op_type>
__device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
{
static_for<0, ThreadBufferLen, 1>{}(
[&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });
};
};
template <typename BufferType,
typename IdxBufferType,
typename opReduce,
NanPropagation_t nanPropaOpt>
struct ThreadReduceWithIndicesInput
{
using compType = typename opReduce::dataType;
static_assert(BufferType::IsStaticBuffer(), "Thread-wise reduction needs use StaticBuffer!");
static_assert(IdxBufferType::IsStaticBuffer(),
"Thread-wise reduction needs use StaticBuffer for indices!");
static_assert(
std::is_same<typename BufferType::type, compType>::value,
"Data type of StaticBuffer for Thread-wise reduction should be same as the compType!");
static_assert(std::is_same<typename IdxBufferType::type, index_t>::value,
"Indices type of StaticBuffer for Thread-wise reduction should be index_t!");
static_assert(BufferType::Size() == IdxBufferType::Size(),
"StaticBuffers for data and indices should have the same sizes!");
static constexpr index_t ThreadBufferLen = BufferType::Size();
using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
// This interface accumulates on both data values and indices and
// is called by Direct_ThreadWise reduction method at second-time reduction
__device__ static void Reduce(const BufferType& thread_buffer,
const IdxBufferType& thread_indices_buffer,
compType& accuData,
int& accuIndex)
{
static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
binop::calculate(accuData, thread_buffer[I], accuIndex, thread_indices_buffer[I]);
});
};
// Set the elements in the per-thread buffer to a specific value
// cppcheck-suppress constParameter
__device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
{
static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });
};
// Execute unary operation on the per-thread buffer elements
template <typename unary_op_type>
__device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
{
static_for<0, ThreadBufferLen, 1>{}(
[&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });
};
};
}; // end of namespace ck
#endif
#ifndef CK_MATH_V2_HPP
#define CK_MATH_V2_HPP
#include "data_type.hpp"
namespace ck {
namespace math {
static inline __device__ half_t abs(half_t x) { return __habs(x); };
static inline __device__ half_t sqrtf(half_t x) { return hsqrt(x); };
static inline __device__ bool isnan(half_t x) { return __hisnan(x); };
} // namespace math
} // namespace ck
#endif
......@@ -48,6 +48,18 @@ struct float_equal_zero
};
};
template <index_t N>
static constexpr __device__ index_t get_shift()
{
return (get_shift<N / 2>() + 1);
};
template <>
constexpr __device__ index_t get_shift<1>()
{
return (0);
}
}; // end of namespace ck
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment