Unverified commit f305bebd, authored by Chao Liu, committed by GitHub
Browse files

Merge pull request #31 from ROCmSoftwarePlatform/miopen_downstream-dynamic_reduction_pr

[MIOpen Downstream] Dynamic Reduction PR
parents f3acd251 b725e3fc
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_BLOCKWISE_HPP
#define CK_GRIDWISE_GENERIC_2D_REDUCTION_BLOCKWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_blockwise.hpp"
#include "blockwise_tensor_slice_transfer.hpp"
namespace ck {
// Gridwise reduction of a 2-D source view to a 1-D destination in which one
// workgroup cooperates on one source row: the source window starts at
// (get_block_1d_id(), 0) and thread 0 of the workgroup writes destination
// element get_block_1d_id().
//
// Source data is staged through an LDS buffer of
// BlockBufferSize = GredAccessesPerThreadInBlock * BlockSize elements of
// compType, reduced with BlockwiseReduction_2d_block_buffer and accumulated in
// a one-element register (Vgpr) buffer.
//
// The variant is selected by the RunId template argument of Run():
//   Run<1> - reduce values only;
//   Run<2> - reduce values and also output an index, with indices generated
//            from the element position inside the row;
//   Run<3> - reduce (value, index) pairs read from ws_values_global and
//            ws_indices_global.
//
// NOTE(review): reduceIndicesOpt is not referenced inside this struct.
// NOTE(review): no explicit tail masking is visible here; the source buffers
// are created with opReduce::GetZeroVal() as their extra argument, presumably
// the value returned for invalid elements - confirm against the buffer and
// descriptor implementation.
template <index_t BlockSize,
          typename srcDataType,
          typename dstDataType,
          typename compType,
          typename src2dDescType,
          typename dst1dDescType,
          ReduceTensorOp_t op,
          NanPropagation_t nanPropaOpt,
          ReduceTensorIndices_t reduceIndicesOpt,
          bool isFirstCall,
          bool isLastCall,
          index_t GredAccessesPerThreadInBlock>
struct GridwiseReduction_xy_to_x_blockwise
{
    // Both values size the LDS buffers and drive the loop strides below.
    static_assert(BlockSize > 0 && GredAccessesPerThreadInBlock > 0,
                  "BlockSize and GredAccessesPerThreadInBlock must be positive");

    // Binary reduction operator and the element-wise operators applied
    // before / after the reduction.
    using opReduce = typename reduce_binary_operator<compType, op>::opType;
    using preUnaryOpType =
        typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::preUnaryOp;
    using posUnaryOpType =
        typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::posUnaryOp;

    // Packed [GredAccessesPerThreadInBlock, BlockSize] view used by the blockwise reducer.
    static constexpr auto buffer2dDesc = make_naive_tensor_descriptor_packed(
        make_tuple(Number<GredAccessesPerThreadInBlock>{}, Number<BlockSize>{}));

    using blockwise_reduce =
        BlockwiseReduction_2d_block_buffer<decltype(buffer2dDesc), true, opReduce, nanPropaOpt>;

    static constexpr index_t BlockBufferSize = buffer2dDesc.GetElementSize();

    static constexpr auto I0 = Number<0>{};

    // Primary declaration; only the specializations for RunId 1, 2 and 3 are defined.
    //
    // src2dDesc         descriptor of the 2-D source view; dimension 1 is reduced
    // dst1dDesc         descriptor of the 1-D destination
    // origReduceLen     passed as "divider" to the pre/post unary operators
    // alpha             scale applied to the reduced value (skipped when equal to one)
    // p_src_global      source values (named ws_values_global in Run<3>)
    // beta              scale applied to the prior destination value (skipped when zero)
    // p_dst_global      destination values
    // ws_indices_global source indices, read by Run<3> only
    // indices_global    destination indices, written by Run<2> and Run<3>
    template <int RunId>
    __device__ static void Run(const src2dDescType& src2dDesc,
                               const dst1dDescType& dst1dDesc,
                               int origReduceLen,
                               srcDataType alpha,
                               const srcDataType* const __restrict__ p_src_global,
                               dstDataType beta,
                               dstDataType* const __restrict__ p_dst_global,
                               const int* const __restrict__ ws_indices_global,
                               int* const __restrict__ indices_global);

    // Value-only reduction: dst[block] = alpha * posOp(reduce(preOp(src[block, :])))
    //                                    + beta * dst[block].
    template <>
    __device__ static void Run<1>(const src2dDescType& src2dDesc,
                                  const dst1dDescType& dst1dDesc,
                                  int origReduceLen,
                                  srcDataType alpha,
                                  const srcDataType* const __restrict__ p_src_global,
                                  dstDataType beta,
                                  dstDataType* const __restrict__ p_dst_global,
                                  const int* const __restrict__ ws_indices_global,
                                  int* const __restrict__ indices_global)
    {
        // Index buffers are not used by this variant.
        (void)ws_indices_global;
        (void)indices_global;

        // LDS
        __shared__ compType p_in_block_buffer[BlockBufferSize];

        auto zeroVal = opReduce::GetZeroVal();

        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
        auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_dst_global, dst1dDesc.GetElementSpaceSize());

        auto in_block_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_buffer, BlockBufferSize);

        // Running result, seeded with the operator's zero value.
        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;

        accuValue_buf(I0) = zeroVal;

        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
        const int divider         = origReduceLen;

        const preUnaryOpType preUnaryOp(divider);
        const posUnaryOpType posUnaryOp(divider);

        const index_t thread_local_id    = get_thread_local_1d_id();
        const index_t block_global_1d_id = get_block_1d_id();

        // The LDS buffer viewed as a single row of BlockBufferSize elements.
        constexpr auto in_block_desc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number<BlockBufferSize>{}));

        using ThreadSliceLengths   = Sequence<1, GredAccessesPerThreadInBlock>;
        using ThreadClusterLengths = Sequence<1, BlockSize>;

        // Global -> LDS copy of one [1, BlockBufferSize] window, converting
        // srcDataType to compType.
        auto blockwise_src_load =
            BlockwiseTensorSliceTransfer_v4<BlockSize,
                                            InMemoryDataOperationEnum_t::Set,
                                            Sequence<1, BlockBufferSize>,
                                            ThreadSliceLengths,
                                            ThreadClusterLengths,
                                            Sequence<0, 1>,
                                            srcDataType,
                                            compType,
                                            src2dDescType,
                                            decltype(in_block_desc),
                                            Sequence<0, 1>,
                                            Sequence<0, 1>,
                                            1,
                                            1,
                                            1,
                                            1,
                                            1,
                                            1,
                                            false,
                                            true>(src2dDesc,
                                                  make_multi_index(block_global_1d_id, 0),
                                                  in_block_desc,
                                                  make_multi_index(0, 0));

        constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);

        // Row length in units of BlockSize elements, rounded up.
        const index_t toReduceBlocks = (toReduceLength + BlockSize - 1) / BlockSize;

        for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
            reducedBlocks += GredAccessesPerThreadInBlock)
        {
            blockwise_src_load.RunRead(src2dDesc, src_global_buf);
            blockwise_src_load.RunWrite(in_block_desc, in_block_buf);

            __syncthreads();

            // do element-wise pre-reduction operation
            blockwise_reduce::operate_on_elements(preUnaryOp, in_block_buf);

            // A full window holds GredAccessesPerThreadInBlock blocks; the
            // last window may hold fewer.
            index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
                                        ? GredAccessesPerThreadInBlock
                                        : toReduceBlocks - reducedBlocks;
            blockwise_reduce::Reduce(in_block_buf, BlocksInOneOp, accuValue_buf(I0));

            blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
        }

        accuValue_buf(I0) = posUnaryOp(accuValue_buf[I0]);

        constexpr auto ReducedDataDesc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

        // The first thread in the block stores the reduced result to the global location
        // representing the block
        if(thread_local_id == 0)
        {
            if(!float_equal_one{}(alpha))
                accuValue_buf(I0) *= type_convert<compType>{}(alpha);

            if(!float_equal_zero{}(beta))
            {
                // Read the prior destination value so it can be blended in with beta.
                auto threadwise_dst_load =
                    ThreadwiseTensorSliceTransfer_v2<dstDataType,
                                                     dstDataType,
                                                     dst1dDescType,
                                                     decltype(ReducedDataDesc),
                                                     Sequence<1>,
                                                     Sequence<0>,
                                                     0,
                                                     1,
                                                     1,
                                                     false>(dst1dDesc,
                                                            make_multi_index(block_global_1d_id));

                StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;

                threadwise_dst_load.Run(
                    dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);

                accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
            }

            auto threadwise_dst_store =
                ThreadwiseTensorSliceTransfer_v1r3<compType,
                                                   dstDataType,
                                                   decltype(ReducedDataDesc),
                                                   dst1dDescType,
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   false>(dst1dDesc,
                                                          make_multi_index(block_global_1d_id));

            threadwise_dst_store.Run(
                ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_buf);
        }
    }

    // Value + index reduction. Indices are generated per window with
    // init_buffer_indices(), offset by the number of elements already consumed,
    // and the selected index is written to indices_global[block].
    // No post-reduction unary operator is applied in this variant.
    template <>
    __device__ static void Run<2>(const src2dDescType& src2dDesc,
                                  const dst1dDescType& dst1dDesc,
                                  int origReduceLen,
                                  srcDataType alpha,
                                  const srcDataType* const __restrict__ p_src_global,
                                  dstDataType beta,
                                  dstDataType* const __restrict__ p_dst_global,
                                  const int* const __restrict__ ws_indices_global,
                                  int* const __restrict__ indices_global)
    {
        // No input indices in this variant.
        (void)ws_indices_global;

        // LDS
        __shared__ compType p_in_block_buffer[BlockBufferSize];
        __shared__ int block_indices_buffer[BlockBufferSize];

        auto zeroVal = opReduce::GetZeroVal();

        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
        auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_dst_global, dst1dDesc.GetElementSpaceSize());
        auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            indices_global, dst1dDesc.GetElementSpaceSize());

        auto in_block_val_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_buffer, BlockBufferSize);
        auto in_block_idx_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(block_indices_buffer, BlockBufferSize);

        // Running (value, index) result.
        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;

        accuValue_buf(I0) = zeroVal;
        accuIndex_buf(I0) = 0;

        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
        const int divider         = origReduceLen;

        const preUnaryOpType preUnaryOp(divider);

        const index_t thread_local_id    = get_thread_local_1d_id();
        const index_t block_global_1d_id = get_block_1d_id();

        constexpr auto in_block_desc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number<BlockBufferSize>{}));

        using ThreadSliceLengths   = Sequence<1, GredAccessesPerThreadInBlock>;
        using ThreadClusterLengths = Sequence<1, BlockSize>;

        // Global -> LDS copy. The destination element type is compType so that
        // it matches p_in_block_buffer, exactly as in Run<1> and Run<3>.
        auto blockwise_src_load =
            BlockwiseTensorSliceTransfer_v4<BlockSize,
                                            InMemoryDataOperationEnum_t::Set,
                                            Sequence<1, BlockBufferSize>,
                                            ThreadSliceLengths,
                                            ThreadClusterLengths,
                                            Sequence<0, 1>,
                                            srcDataType,
                                            compType,
                                            src2dDescType,
                                            decltype(in_block_desc),
                                            Sequence<0, 1>,
                                            Sequence<0, 1>,
                                            1,
                                            1,
                                            1,
                                            1,
                                            1,
                                            1,
                                            false,
                                            true>(src2dDesc,
                                                  make_multi_index(block_global_1d_id, 0),
                                                  in_block_desc,
                                                  make_multi_index(0, 0));

        constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);

        const index_t toReduceBlocks = (toReduceLength + BlockSize - 1) / BlockSize;

        // Position of the current window's first element inside the row.
        int indexOffset = 0;

        for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
            reducedBlocks += GredAccessesPerThreadInBlock)
        {
            // load block data from global to LDS, no use of double buffers (to be improved)
            blockwise_src_load.RunRead(src2dDesc, src_global_buf);
            blockwise_src_load.RunWrite(in_block_desc, in_block_val_buf);

            __syncthreads();

            // construct the indices for the current toReduce blocks
            blockwise_reduce::init_buffer_indices(in_block_idx_buf, indexOffset);

            // unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually
            // done here
            blockwise_reduce::operate_on_elements(preUnaryOp, in_block_val_buf);

            index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
                                        ? GredAccessesPerThreadInBlock
                                        : toReduceBlocks - reducedBlocks;

            blockwise_reduce::Reduce2(in_block_val_buf,
                                      in_block_idx_buf,
                                      BlocksInOneOp,
                                      accuValue_buf(I0),
                                      accuIndex_buf(I0));

            indexOffset += BlockBufferSize;

            blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
        }

        constexpr auto ReducedDataDesc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

        // The first thread in the block stores the reduced result to the global location
        // representing the block
        if(thread_local_id == 0)
        {
            if(!float_equal_one{}(alpha))
                accuValue_buf(I0) *= type_convert<compType>{}(alpha);

            if(!float_equal_zero{}(beta))
            {
                auto threadwise_dst_load =
                    ThreadwiseTensorSliceTransfer_v2<dstDataType,
                                                     dstDataType,
                                                     dst1dDescType,
                                                     decltype(ReducedDataDesc),
                                                     Sequence<1>,
                                                     Sequence<0>,
                                                     0,
                                                     1,
                                                     1,
                                                     false>(dst1dDesc,
                                                            make_multi_index(block_global_1d_id));

                StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;

                threadwise_dst_load.Run(dst1dDesc,
                                        dst_global_val_buf,
                                        ReducedDataDesc,
                                        make_tuple(I0),
                                        priorDstValue_buf);

                accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
            }

            auto threadwise_dst_val_store =
                ThreadwiseTensorSliceTransfer_v1r3<compType,
                                                   dstDataType,
                                                   decltype(ReducedDataDesc),
                                                   dst1dDescType,
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   false>(dst1dDesc,
                                                          make_multi_index(block_global_1d_id));

            auto threadwise_dst_idx_store =
                ThreadwiseTensorSliceTransfer_v1r3<int,
                                                   int,
                                                   decltype(ReducedDataDesc),
                                                   dst1dDescType,
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   false>(dst1dDesc,
                                                          make_multi_index(block_global_1d_id));

            threadwise_dst_val_store.Run(
                ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf);
            threadwise_dst_idx_store.Run(
                ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
        }
    }

    // Reduction of (value, index) pairs: values come from ws_values_global and
    // their indices from ws_indices_global, both addressed through src2dDesc.
    // No pre- or post-reduction unary operator is applied in this variant.
    // NOTE(review): the workspace presumably holds partial results of an
    // earlier reduction call - confirm against the host-side launcher.
    template <>
    __device__ static void Run<3>(const src2dDescType& src2dDesc,
                                  const dst1dDescType& dst1dDesc,
                                  int origReduceLen,
                                  srcDataType alpha,
                                  const srcDataType* const __restrict__ ws_values_global,
                                  dstDataType beta,
                                  dstDataType* const __restrict__ p_dst_global,
                                  const int* const __restrict__ ws_indices_global,
                                  int* const __restrict__ indices_global)
    {
        // No unary operators here, so the divider is not needed.
        (void)origReduceLen;

        // LDS
        __shared__ compType p_in_block_buffer[BlockBufferSize];
        __shared__ int block_indices_buffer[BlockBufferSize];

        auto zeroVal = opReduce::GetZeroVal();

        const auto src_global_val_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Global>(ws_values_global,
                                                            src2dDesc.GetElementSpaceSize(),
                                                            type_convert<srcDataType>{}(zeroVal));
        const auto src_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            ws_indices_global, src2dDesc.GetElementSpaceSize());
        auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_dst_global, dst1dDesc.GetElementSpaceSize());
        auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            indices_global, dst1dDesc.GetElementSpaceSize());

        auto in_block_val_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_buffer, BlockBufferSize);
        auto in_block_idx_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(block_indices_buffer, BlockBufferSize);

        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;

        accuValue_buf(I0) = zeroVal;
        accuIndex_buf(I0) = 0;

        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});

        const index_t thread_local_id    = get_thread_local_1d_id();
        const index_t block_global_1d_id = get_block_1d_id();

        constexpr auto in_block_desc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number<BlockBufferSize>{}));

        using ThreadSliceLengths   = Sequence<1, GredAccessesPerThreadInBlock>;
        using ThreadClusterLengths = Sequence<1, BlockSize>;

        // Two parallel Global -> LDS copies over the same window: one for the
        // values (srcDataType -> compType) and one for the indices (int -> int).
        auto blockwise_src_val_load =
            BlockwiseTensorSliceTransfer_v4<BlockSize,
                                            InMemoryDataOperationEnum_t::Set,
                                            Sequence<1, BlockBufferSize>,
                                            ThreadSliceLengths,
                                            ThreadClusterLengths,
                                            Sequence<0, 1>,
                                            srcDataType,
                                            compType,
                                            src2dDescType,
                                            decltype(in_block_desc),
                                            Sequence<0, 1>,
                                            Sequence<0, 1>,
                                            1,
                                            1,
                                            1,
                                            1,
                                            1,
                                            1,
                                            false,
                                            true>(src2dDesc,
                                                  make_multi_index(block_global_1d_id, 0),
                                                  in_block_desc,
                                                  make_multi_index(0, 0));

        auto blockwise_src_idx_load =
            BlockwiseTensorSliceTransfer_v4<BlockSize,
                                            InMemoryDataOperationEnum_t::Set,
                                            Sequence<1, BlockBufferSize>,
                                            ThreadSliceLengths,
                                            ThreadClusterLengths,
                                            Sequence<0, 1>,
                                            int,
                                            int,
                                            src2dDescType,
                                            decltype(in_block_desc),
                                            Sequence<0, 1>,
                                            Sequence<0, 1>,
                                            1,
                                            1,
                                            1,
                                            1,
                                            1,
                                            1,
                                            false,
                                            true>(src2dDesc,
                                                  make_multi_index(block_global_1d_id, 0),
                                                  in_block_desc,
                                                  make_multi_index(0, 0));

        constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);

        const index_t toReduceBlocks = (toReduceLength + BlockSize - 1) / BlockSize;

        for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
            reducedBlocks += GredAccessesPerThreadInBlock)
        {
            // load block data from global to LDS, no use of double buffers (to be improved)
            blockwise_src_val_load.RunRead(src2dDesc, src_global_val_buf);
            blockwise_src_idx_load.RunRead(src2dDesc, src_global_idx_buf);
            blockwise_src_val_load.RunWrite(in_block_desc, in_block_val_buf);
            blockwise_src_idx_load.RunWrite(in_block_desc, in_block_idx_buf);

            __syncthreads();

            index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
                                        ? GredAccessesPerThreadInBlock
                                        : toReduceBlocks - reducedBlocks;

            blockwise_reduce::Reduce2(in_block_val_buf,
                                      in_block_idx_buf,
                                      BlocksInOneOp,
                                      accuValue_buf(I0),
                                      accuIndex_buf(I0));

            blockwise_src_val_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
            blockwise_src_idx_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
        }

        constexpr auto ReducedDataDesc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

        // The first thread in the block stores the reduced result to the global location
        // representing the block
        if(thread_local_id == 0)
        {
            if(!float_equal_one{}(alpha))
                accuValue_buf(I0) *= type_convert<compType>{}(alpha);

            // NOTE(review): the trailing bool of the three transfers below is
            // `true` here but `false` in Run<1>/Run<2> - confirm this is intended.
            if(!float_equal_zero{}(beta))
            {
                auto threadwise_dst_load =
                    ThreadwiseTensorSliceTransfer_v2<dstDataType,
                                                     dstDataType,
                                                     dst1dDescType,
                                                     decltype(ReducedDataDesc),
                                                     Sequence<1>,
                                                     Sequence<0>,
                                                     0,
                                                     1,
                                                     1,
                                                     true>(dst1dDesc,
                                                           make_multi_index(block_global_1d_id));

                StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;

                threadwise_dst_load.Run(dst1dDesc,
                                        dst_global_val_buf,
                                        ReducedDataDesc,
                                        make_tuple(I0),
                                        priorDstValue_buf);

                accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
            }

            auto threadwise_dst_val_store =
                ThreadwiseTensorSliceTransfer_v1r3<compType,
                                                   dstDataType,
                                                   decltype(ReducedDataDesc),
                                                   dst1dDescType,
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   true>(dst1dDesc,
                                                         make_multi_index(block_global_1d_id));

            auto threadwise_dst_idx_store =
                ThreadwiseTensorSliceTransfer_v1r3<int,
                                                   int,
                                                   decltype(ReducedDataDesc),
                                                   dst1dDescType,
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   true>(dst1dDesc,
                                                         make_multi_index(block_global_1d_id));

            threadwise_dst_val_store.Run(
                ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf);
            threadwise_dst_idx_store.Run(
                ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
        }
    }
};
} // namespace ck
#endif
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_THREADWISE_HPP
#define CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_THREADWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_threadwise.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
namespace ck {
template <index_t BlockSize,
typename srcDataType,
typename dstDataType,
typename compType,
typename src2dDescType,
typename dst1dDescType,
ReduceTensorOp_t op,
NanPropagation_t nanPropaOpt,
ReduceTensorIndices_t reduceIndicesOpt,
bool isFirstCall,
bool isLastCall,
index_t GredThreadBufferLength>
struct GridwiseReduction_xy_to_x_direct_threadwise
{
using opReduce = typename reduce_binary_operator<compType, op>::opType;
using preUnaryOpType =
typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::preUnaryOp;
using posUnaryOpType =
typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::posUnaryOp;
static constexpr auto I0 = Number<0>{};
template <int RunId>
__device__ static void Run(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global);
template <>
__device__ static void Run<1>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global)
{
(void)ws_indices_global;
(void)indices_global;
const auto zeroVal = opReduce::GetZeroVal();
const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_dst_global, dst1dDesc.GetElementSpaceSize());
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredThreadBufferLength, true>
in_thread_buf;
using threadwise_reduce = ThreadReduce<decltype(in_thread_buf), opReduce, nanPropaOpt>;
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
accuValue_buf(I0) = zeroVal;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
const int divider = origReduceLen;
const preUnaryOpType preUnaryOp(divider);
const posUnaryOpType posUnaryOp(divider);
using ThreadBufferLengths = Sequence<1, GredThreadBufferLength>;
constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
make_tuple(Number<1>{}, Number<GredThreadBufferLength>{}));
index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
compType,
src2dDescType,
decltype(ThreadBufferDesc),
ThreadBufferLengths,
Sequence<0, 1>,
1,
1,
1,
false>(
src2dDesc, make_multi_index(thread_global_1d_id, 0));
constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength);
for(index_t reducedLength = 0; reducedLength < toReduceLength;
reducedLength += GredThreadBufferLength)
{
threadwise_src_load.Run(
src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf);
// do element-wise pre-reduction operation
threadwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf);
// do the reduction on the Thread Buffer
threadwise_reduce::Reduce(in_thread_buf, accuValue_buf(I0));
threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
}
accuValue_buf(I0) = posUnaryOp(accuValue_buf[I0]);
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>{}(alpha);
if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<dstDataType,
dstDataType,
dst1dDescType,
decltype(ReducedDataDesc),
Sequence<1>,
Sequence<0>,
0,
1,
1,
true>(
dst1dDesc, make_multi_index(thread_global_1d_id));
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
threadwise_dst_load.Run(
dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);
accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
}
auto threadwise_dst_store =
ThreadwiseTensorSliceTransfer_v1r3<compType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(dst1dDesc,
make_multi_index(thread_global_1d_id));
threadwise_dst_store.Run(
ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_buf);
};
template <>
__device__ static void Run<2>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global)
{
(void)ws_indices_global;
const auto zeroVal = opReduce::GetZeroVal();
const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_dst_global, dst1dDesc.GetElementSpaceSize());
auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
indices_global, dst1dDesc.GetElementSpaceSize());
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredThreadBufferLength, true>
in_thread_buf;
using threadwise_reduce = ThreadReduce<decltype(in_thread_buf), opReduce, nanPropaOpt>;
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
accuValue_buf(I0) = zeroVal;
accuIndex_buf(I0) = 0;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
const int divider = origReduceLen;
const preUnaryOpType preUnaryOp(divider);
using ThreadBufferLengths = Sequence<1, GredThreadBufferLength>;
constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
make_tuple(Number<1>{}, Number<GredThreadBufferLength>{}));
index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
dstDataType,
src2dDescType,
decltype(ThreadBufferDesc),
ThreadBufferLengths,
Sequence<0, 1>,
1,
1,
1,
false>(
src2dDesc, make_multi_index(thread_global_1d_id, 0));
constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength);
index_t indexStart = 0;
for(index_t reducedLength = 0; reducedLength < toReduceLength;
reducedLength += GredThreadBufferLength)
{
threadwise_src_load.Run(
src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf);
// unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually
// done here
threadwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf);
// do the reduction on the Thread Buffer
threadwise_reduce::Reduce2(
in_thread_buf, accuValue_buf(I0), accuIndex_buf(I0), indexStart);
indexStart += GredThreadBufferLength;
threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
}
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>{}(alpha);
if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<dstDataType,
dstDataType,
dst1dDescType,
decltype(ReducedDataDesc),
Sequence<1>,
Sequence<0>,
0,
1,
1,
false>(
dst1dDesc, make_multi_index(thread_global_1d_id));
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
threadwise_dst_load.Run(
dst1dDesc, dst_global_val_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);
accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
}
auto threadwise_dst_val_store =
ThreadwiseTensorSliceTransfer_v1r3<compType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
false>(dst1dDesc,
make_multi_index(thread_global_1d_id));
auto threadwise_dst_idx_store =
ThreadwiseTensorSliceTransfer_v1r3<int,
int,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
false>(dst1dDesc,
make_multi_index(thread_global_1d_id));
threadwise_dst_val_store.Run(
ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf);
threadwise_dst_idx_store.Run(
ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
};
template <>
__device__ static void Run<3>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ ws_values_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global)
{
(void)origReduceLen;
const auto zeroVal = opReduce::GetZeroVal();
const auto src_global_val_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Global>(ws_values_global,
src2dDesc.GetElementSpaceSize(),
type_convert<srcDataType>{}(zeroVal));
const auto src_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
ws_indices_global, src2dDesc.GetElementSpaceSize());
auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_dst_global, dst1dDesc.GetElementSpaceSize());
auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
indices_global, dst1dDesc.GetElementSpaceSize());
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredThreadBufferLength, true>
in_thread_val_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, int, GredThreadBufferLength, true> in_thread_idx_buf;
using threadwise_reduce = ThreadReduceWithIndicesInput<decltype(in_thread_val_buf),
decltype(in_thread_idx_buf),
opReduce,
nanPropaOpt>;
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
accuValue_buf(I0) = zeroVal;
accuIndex_buf(I0) = 0;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
using ThreadBufferLengths = Sequence<1, GredThreadBufferLength>;
constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
make_tuple(Number<1>{}, Number<GredThreadBufferLength>{}));
index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
auto threadwise_src_val_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
dstDataType,
src2dDescType,
decltype(ThreadBufferDesc),
ThreadBufferLengths,
Sequence<0, 1>,
1,
1,
1,
false>(
src2dDesc, make_multi_index(thread_global_1d_id, 0));
auto threadwise_src_idx_load = ThreadwiseTensorSliceTransfer_v2<int,
int,
src2dDescType,
decltype(ThreadBufferDesc),
ThreadBufferLengths,
Sequence<0, 1>,
1,
1,
1,
false>(
src2dDesc, make_multi_index(thread_global_1d_id, 0));
constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength);
for(index_t reducedLength = 0; reducedLength < toReduceLength;
reducedLength += GredThreadBufferLength)
{
threadwise_src_val_load.Run(src2dDesc,
src_global_val_buf,
ThreadBufferDesc,
make_tuple(I0, I0),
in_thread_val_buf);
threadwise_src_idx_load.Run(src2dDesc,
src_global_idx_buf,
ThreadBufferDesc,
make_tuple(I0, I0),
in_thread_idx_buf);
// do the reduction on the Thread Buffer
threadwise_reduce::Reduce(
in_thread_val_buf, in_thread_idx_buf, accuValue_buf(I0), accuIndex_buf(I0));
threadwise_src_val_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
threadwise_src_idx_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
}
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>{}(alpha);
if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<dstDataType,
dstDataType,
dst1dDescType,
decltype(ReducedDataDesc),
Sequence<1>,
Sequence<0>,
0,
1,
1,
false>(
dst1dDesc, make_multi_index(thread_global_1d_id));
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
threadwise_dst_load.Run(
dst1dDesc, dst_global_val_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);
accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
}
auto threadwise_dst_val_store =
ThreadwiseTensorSliceTransfer_v1r3<compType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
false>(dst1dDesc,
make_multi_index(thread_global_1d_id));
auto threadwise_dst_idx_store =
ThreadwiseTensorSliceTransfer_v1r3<int,
int,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
false>(dst1dDesc,
make_multi_index(thread_global_1d_id));
threadwise_dst_val_store.Run(
ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf);
threadwise_dst_idx_store.Run(
ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
};
};
} // namespace ck
#endif
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_WARPWISE_HPP
#define CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_WARPWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_warpwise.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
namespace ck {
template <index_t BlockSize,
typename srcDataType,
typename dstDataType,
typename compType,
typename src2dDescType,
typename dst1dDescType,
ReduceTensorOp_t op,
NanPropagation_t nanPropaOpt,
ReduceTensorIndices_t reduceIndicesOpt,
bool isFirstCall,
bool isLastCall,
index_t GredAccessesPerThreadInWarp>
struct GridwiseReduction_xy_to_x_direct_warpwise
{
// Binary reduction operator selected by `op`, instantiated for compType.
using opReduce = typename reduce_binary_operator<compType, op>::opType;
// Unary operator types chosen from `op` and the isFirstCall / isLastCall flags.
// In Run<1> below the "pre" operator is applied to the loaded elements before they are
// reduced and the "pos" operator to the accumulated result; both are constructed from
// origReduceLen.
using preUnaryOpType =
typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::preUnaryOp;
using posUnaryOpType =
typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::posUnaryOp;
// Compile-time index 0, used to address single-element buffers and as tuple origin.
static constexpr auto I0 = Number<0>{};
// Device entry point of the reduction. Only declared here; the behaviour is provided by
// the explicit specializations that follow in this struct:
//   RunId == 1 : reduces values only; both index pointers are ignored.
//   RunId == 2 : reduces values and produces indices; writes indices_global and ignores
//                ws_indices_global.
//   RunId == 3 : reduces (value, index) pairs; reads the values through the fifth
//                parameter and the indices through ws_indices_global, and writes
//                indices_global. origReduceLen is ignored.
// In every variant alpha scales the reduced value, and a non-zero beta adds
// beta * (value previously stored at the destination) before the store.
template <int RunId>
__device__ static void Run(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global);
// Run<1>: reduce values only, reading directly from the source tensor.
//
// Work distribution as set up below: the global warp id selects the index along
// dimension 0 of src2dDesc, and the lanes of that warp sweep dimension 1, each lane
// loading GredAccessesPerThreadInWarp consecutive elements per pass. After the sweep,
// lane 0 scales the result by alpha, optionally adds beta * (previous destination
// value), and stores one element to dst1dDesc at the warp's index.
//
// origReduceLen is forwarded to the pre-/post- unary operators. Both index pointers are
// unused by this variant.
template <>
__device__ static void Run<1>(const src2dDescType& src2dDesc,
                              const dst1dDescType& dst1dDesc,
                              int origReduceLen,
                              srcDataType alpha,
                              const srcDataType* const __restrict__ p_src_global,
                              dstDataType beta,
                              dstDataType* const __restrict__ p_dst_global,
                              const int* const __restrict__ ws_indices_global,
                              int* const __restrict__ indices_global)
{
    (void)ws_indices_global;
    (void)indices_global;

    auto initVal = opReduce::GetZeroVal();

    // Views over global memory. The converted initial value is handed to the source
    // view (presumably the value substituted for invalid reads -- confirm against
    // make_dynamic_buffer).
    const auto srcBuf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
        p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(initVal));
    auto dstBuf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
        p_dst_global, dst1dDesc.GetElementSpaceSize());

    // Per-lane staging storage and the running accumulator.
    StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredAccessesPerThreadInWarp, true> laneBuf;
    StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accBuf;

    using WarpReducer = WarpReduce<decltype(laneBuf), BlockSize, opReduce, nanPropaOpt>;

    accBuf(I0) = initVal;

    const auto reduceLen = src2dDesc.GetLength(Number<1>{});

    const preUnaryOpType preOp(origReduceLen);
    const posUnaryOpType posOp(origReduceLen);

    using LaneSlice = Sequence<1, GredAccessesPerThreadInWarp>;
    constexpr auto laneBufDesc = make_naive_tensor_descriptor_packed(
        make_tuple(Number<1>{}, Number<GredAccessesPerThreadInWarp>{}));

    const index_t globalTid = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
    const index_t warpId    = globalTid / warpSize;
    const index_t laneId    = globalTid % warpSize;

    using SrcLoad = ThreadwiseTensorSliceTransfer_v2<srcDataType,
                                                     compType,
                                                     src2dDescType,
                                                     decltype(laneBufDesc),
                                                     LaneSlice,
                                                     Sequence<0, 1>,
                                                     1,
                                                     1,
                                                     1,
                                                     false>;
    auto srcLoad =
        SrcLoad(src2dDesc, make_multi_index(warpId, laneId * GredAccessesPerThreadInWarp));

    // Number of dimension-1 elements the whole warp consumes per pass.
    constexpr index_t passWidth = warpSize * GredAccessesPerThreadInWarp;
    constexpr auto windowStep   = make_multi_index(0, passWidth);

    for(index_t covered = 0; covered < reduceLen; covered += passWidth)
    {
        srcLoad.Run(src2dDesc, srcBuf, laneBufDesc, make_tuple(I0, I0), laneBuf);

        // element-wise operation applied before reducing
        WarpReducer::operate_on_elements(preOp, laneBuf);

        // warp-wise reduction over the staged data of all lanes
        WarpReducer::Reduce(laneBuf, accBuf(I0));

        srcLoad.MoveSrcSliceWindow(src2dDesc, windowStep);
    }

    accBuf(I0) = posOp(accBuf[I0]);

    // Only the first lane of the warp publishes the warp's result.
    if(laneId != 0)
        return;

    constexpr auto scalarDesc = make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

    if(!float_equal_one{}(alpha))
        accBuf(I0) *= type_convert<compType>{}(alpha);

    if(!float_equal_zero{}(beta))
    {
        // Blend with the value already stored at the destination.
        using DstLoad = ThreadwiseTensorSliceTransfer_v2<dstDataType,
                                                         dstDataType,
                                                         dst1dDescType,
                                                         decltype(scalarDesc),
                                                         Sequence<1>,
                                                         Sequence<0>,
                                                         0,
                                                         1,
                                                         1,
                                                         true>;
        auto dstLoad = DstLoad(dst1dDesc, make_multi_index(warpId));

        StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDst;
        dstLoad.Run(dst1dDesc, dstBuf, scalarDesc, make_tuple(I0), priorDst);

        accBuf(I0) += type_convert<compType>{}(priorDst(I0) * beta);
    }

    using DstStore = ThreadwiseTensorSliceTransfer_v1r3<compType,
                                                        dstDataType,
                                                        decltype(scalarDesc),
                                                        dst1dDescType,
                                                        Sequence<1>,
                                                        Sequence<0>,
                                                        0,
                                                        1,
                                                        InMemoryDataOperationEnum_t::Set,
                                                        1,
                                                        true>;
    auto dstStore = DstStore(dst1dDesc, make_multi_index(warpId));

    dstStore.Run(scalarDesc, make_tuple(I0), accBuf, dst1dDesc, dstBuf);
}
// Run<2>: reduce values and produce the index of the selected element, reading
// directly from the source tensor.
//
// Same work distribution as the value-only variant: the global warp id selects the
// index along dimension 0 of src2dDesc and the lanes sweep dimension 1. A running
// offset along dimension 1 is passed to the warp reduction on every pass. Lane 0
// finally applies alpha / beta to the value and stores the value to p_dst_global and
// the index to indices_global.
//
// ws_indices_global is unused by this variant.
template <>
__device__ static void Run<2>(const src2dDescType& src2dDesc,
                              const dst1dDescType& dst1dDesc,
                              int origReduceLen,
                              srcDataType alpha,
                              const srcDataType* const __restrict__ p_src_global,
                              dstDataType beta,
                              dstDataType* const __restrict__ p_dst_global,
                              const int* const __restrict__ ws_indices_global,
                              int* const __restrict__ indices_global)
{
    (void)ws_indices_global;

    auto initVal = opReduce::GetZeroVal();

    // Views over global memory: one source, separate value / index destinations.
    const auto srcBuf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
        p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(initVal));
    auto dstValBuf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
        p_dst_global, dst1dDesc.GetElementSpaceSize());
    auto dstIdxBuf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
        indices_global, dst1dDesc.GetElementSpaceSize());

    // Per-lane staging storage and the running (value, index) accumulator.
    StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredAccessesPerThreadInWarp, true> laneBuf;
    StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accValBuf;
    StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accIdxBuf;

    using WarpReducer = WarpReduce<decltype(laneBuf), BlockSize, opReduce, nanPropaOpt>;

    accValBuf(I0) = initVal;
    accIdxBuf(I0) = 0;

    const auto reduceLen = src2dDesc.GetLength(Number<1>{});

    const preUnaryOpType preOp(origReduceLen);

    using LaneSlice = Sequence<1, GredAccessesPerThreadInWarp>;
    constexpr auto laneBufDesc = make_naive_tensor_descriptor_packed(
        make_tuple(Number<1>{}, Number<GredAccessesPerThreadInWarp>{}));

    const index_t globalTid = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
    const index_t warpId    = globalTid / warpSize;
    const index_t laneId    = globalTid % warpSize;

    using SrcLoad = ThreadwiseTensorSliceTransfer_v2<srcDataType,
                                                     compType,
                                                     src2dDescType,
                                                     decltype(laneBufDesc),
                                                     LaneSlice,
                                                     Sequence<0, 1>,
                                                     1,
                                                     1,
                                                     1,
                                                     false>;
    auto srcLoad =
        SrcLoad(src2dDesc, make_multi_index(warpId, laneId * GredAccessesPerThreadInWarp));

    // Number of dimension-1 elements the whole warp consumes per pass.
    constexpr index_t passWidth = warpSize * GredAccessesPerThreadInWarp;
    constexpr auto windowStep   = make_multi_index(0, passWidth);

    // Offset along dimension 1 of the current pass, handed to Reduce2.
    index_t idxBase = 0;

    for(index_t covered = 0; covered < reduceLen; covered += passWidth)
    {
        srcLoad.Run(src2dDesc, srcBuf, laneBufDesc, make_tuple(I0, I0), laneBuf);

        // unary operation before reducing, needed by AMAX; for MIN/MAX nothing is
        // actually done here
        WarpReducer::operate_on_elements(preOp, laneBuf);

        // warp-wise reduction over the staged data of all lanes
        WarpReducer::Reduce2(laneBuf, accValBuf(I0), accIdxBuf(I0), idxBase);

        idxBase += passWidth;

        srcLoad.MoveSrcSliceWindow(src2dDesc, windowStep);
    }

    // Only the first lane of the warp publishes the warp's result.
    if(laneId != 0)
        return;

    constexpr auto scalarDesc = make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

    if(!float_equal_one{}(alpha))
        accValBuf(I0) *= type_convert<compType>{}(alpha);

    if(!float_equal_zero{}(beta))
    {
        // Blend with the value already stored at the destination.
        using DstLoad = ThreadwiseTensorSliceTransfer_v2<dstDataType,
                                                         dstDataType,
                                                         dst1dDescType,
                                                         decltype(scalarDesc),
                                                         Sequence<1>,
                                                         Sequence<0>,
                                                         0,
                                                         1,
                                                         1,
                                                         true>;
        auto dstLoad = DstLoad(dst1dDesc, make_multi_index(warpId));

        StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDst;
        dstLoad.Run(dst1dDesc, dstValBuf, scalarDesc, make_tuple(I0), priorDst);

        accValBuf(I0) += type_convert<compType>{}(priorDst[I0] * beta);
    }

    using DstValStore = ThreadwiseTensorSliceTransfer_v1r3<compType,
                                                           dstDataType,
                                                           decltype(scalarDesc),
                                                           dst1dDescType,
                                                           Sequence<1>,
                                                           Sequence<0>,
                                                           0,
                                                           1,
                                                           InMemoryDataOperationEnum_t::Set,
                                                           1,
                                                           true>;
    using DstIdxStore = ThreadwiseTensorSliceTransfer_v1r3<int,
                                                           int,
                                                           decltype(scalarDesc),
                                                           dst1dDescType,
                                                           Sequence<1>,
                                                           Sequence<0>,
                                                           0,
                                                           1,
                                                           InMemoryDataOperationEnum_t::Set,
                                                           1,
                                                           true>;

    auto dstValStore = DstValStore(dst1dDesc, make_multi_index(warpId));
    auto dstIdxStore = DstIdxStore(dst1dDesc, make_multi_index(warpId));

    dstValStore.Run(scalarDesc, make_tuple(I0), accValBuf, dst1dDesc, dstValBuf);
    dstIdxStore.Run(scalarDesc, make_tuple(I0), accIdxBuf, dst1dDesc, dstIdxBuf);
}
// Run<3>: reduce (value, index) pairs that are read from two workspace arrays.
//
// Values come from ws_values_global and their indices from ws_indices_global; both are
// addressed through src2dDesc with identical slice windows. The global warp id selects
// the index along dimension 0 and the lanes sweep dimension 1. Lane 0 finally applies
// alpha / beta to the value and stores the value to p_dst_global and the index to
// indices_global.
//
// origReduceLen is unused by this variant.
template <>
__device__ static void Run<3>(const src2dDescType& src2dDesc,
                              const dst1dDescType& dst1dDesc,
                              int origReduceLen,
                              srcDataType alpha,
                              const srcDataType* const __restrict__ ws_values_global,
                              dstDataType beta,
                              dstDataType* const __restrict__ p_dst_global,
                              const int* const __restrict__ ws_indices_global,
                              int* const __restrict__ indices_global)
{
    (void)origReduceLen;

    auto initVal = opReduce::GetZeroVal();

    // Views over global memory: value / index sources and value / index destinations.
    const auto srcValBuf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
        ws_values_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(initVal));
    const auto srcIdxBuf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
        ws_indices_global, src2dDesc.GetElementSpaceSize());
    auto dstValBuf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
        p_dst_global, dst1dDesc.GetElementSpaceSize());
    auto dstIdxBuf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
        indices_global, dst1dDesc.GetElementSpaceSize());

    // Per-lane staging storage for values and indices, plus the running accumulator.
    StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredAccessesPerThreadInWarp, true>
        laneValBuf;
    StaticBuffer<AddressSpaceEnum_t::Vgpr, int, GredAccessesPerThreadInWarp, true> laneIdxBuf;
    StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accValBuf;
    StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accIdxBuf;

    using WarpReducer = WarpReduceWithIndicesInput<decltype(laneValBuf),
                                                   decltype(laneIdxBuf),
                                                   BlockSize,
                                                   opReduce,
                                                   nanPropaOpt>;

    accValBuf(I0) = initVal;
    accIdxBuf(I0) = 0;

    const auto reduceLen = src2dDesc.GetLength(Number<1>{});

    using LaneSlice = Sequence<1, GredAccessesPerThreadInWarp>;
    constexpr auto laneBufDesc = make_naive_tensor_descriptor_packed(
        make_tuple(Number<1>{}, Number<GredAccessesPerThreadInWarp>{}));

    const index_t globalTid = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
    const index_t warpId    = globalTid / warpSize;
    const index_t laneId    = globalTid % warpSize;

    using SrcValLoad = ThreadwiseTensorSliceTransfer_v2<srcDataType,
                                                        compType,
                                                        src2dDescType,
                                                        decltype(laneBufDesc),
                                                        LaneSlice,
                                                        Sequence<0, 1>,
                                                        1,
                                                        1,
                                                        1,
                                                        false>;
    using SrcIdxLoad = ThreadwiseTensorSliceTransfer_v2<int,
                                                        int,
                                                        src2dDescType,
                                                        decltype(laneBufDesc),
                                                        LaneSlice,
                                                        Sequence<0, 1>,
                                                        1,
                                                        1,
                                                        1,
                                                        false>;

    // Both loaders start at the same origin and are advanced in lock step.
    auto srcValLoad =
        SrcValLoad(src2dDesc, make_multi_index(warpId, laneId * GredAccessesPerThreadInWarp));
    auto srcIdxLoad =
        SrcIdxLoad(src2dDesc, make_multi_index(warpId, laneId * GredAccessesPerThreadInWarp));

    // Number of dimension-1 elements the whole warp consumes per pass.
    constexpr index_t passWidth = warpSize * GredAccessesPerThreadInWarp;
    constexpr auto windowStep   = make_multi_index(0, passWidth);

    for(index_t covered = 0; covered < reduceLen; covered += passWidth)
    {
        srcValLoad.Run(src2dDesc, srcValBuf, laneBufDesc, make_tuple(I0, I0), laneValBuf);
        srcIdxLoad.Run(src2dDesc, srcIdxBuf, laneBufDesc, make_tuple(I0, I0), laneIdxBuf);

        // warp-wise reduction over the staged data of all lanes
        WarpReducer::Reduce(laneValBuf, laneIdxBuf, accValBuf(I0), accIdxBuf(I0));

        srcValLoad.MoveSrcSliceWindow(src2dDesc, windowStep);
        srcIdxLoad.MoveSrcSliceWindow(src2dDesc, windowStep);
    }

    // Only the first lane of the warp publishes the warp's result.
    if(laneId != 0)
        return;

    constexpr auto scalarDesc = make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

    if(!float_equal_one{}(alpha))
        accValBuf(I0) *= type_convert<compType>{}(alpha);

    if(!float_equal_zero{}(beta))
    {
        // Blend with the value already stored at the destination.
        using DstLoad = ThreadwiseTensorSliceTransfer_v2<dstDataType,
                                                         dstDataType,
                                                         dst1dDescType,
                                                         decltype(scalarDesc),
                                                         Sequence<1>,
                                                         Sequence<0>,
                                                         0,
                                                         1,
                                                         1,
                                                         true>;
        auto dstLoad = DstLoad(dst1dDesc, make_multi_index(warpId));

        StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDst;
        dstLoad.Run(dst1dDesc, dstValBuf, scalarDesc, make_tuple(I0), priorDst);

        accValBuf(I0) += type_convert<compType>{}(priorDst[I0] * beta);
    }

    using DstValStore = ThreadwiseTensorSliceTransfer_v1r3<compType,
                                                           dstDataType,
                                                           decltype(scalarDesc),
                                                           dst1dDescType,
                                                           Sequence<1>,
                                                           Sequence<0>,
                                                           0,
                                                           1,
                                                           InMemoryDataOperationEnum_t::Set,
                                                           1,
                                                           true>;
    using DstIdxStore = ThreadwiseTensorSliceTransfer_v1r3<int,
                                                           int,
                                                           decltype(scalarDesc),
                                                           dst1dDescType,
                                                           Sequence<1>,
                                                           Sequence<0>,
                                                           0,
                                                           1,
                                                           InMemoryDataOperationEnum_t::Set,
                                                           1,
                                                           true>;

    auto dstValStore = DstValStore(dst1dDesc, make_multi_index(warpId));
    auto dstIdxStore = DstIdxStore(dst1dDesc, make_multi_index(warpId));

    dstValStore.Run(scalarDesc, make_tuple(I0), accValBuf, dst1dDesc, dstValBuf);
    dstIdxStore.Run(scalarDesc, make_tuple(I0), accIdxBuf, dst1dDesc, dstIdxBuf);
}
};
} // namespace ck
#endif
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_MULTIBLOCK_HPP
#define CK_GRIDWISE_GENERIC_2D_REDUCTION_MULTIBLOCK_HPP
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_blockwise.hpp"
#include "blockwise_tensor_slice_transfer.hpp"
namespace ck {
template <index_t BlockSize,
typename srcDataType,
typename dstDataType, // not used together with the beta input
typename compType,
typename src2dDescType,
typename dst1dDescType,
ReduceTensorOp_t op,
NanPropagation_t nanPropaOpt,
ReduceTensorIndices_t reduceIndicesOpt,
index_t GredAccessesPerThreadInBlock>
struct GridwiseReduction_xy_to_x_multiblock
{
// Binary reduction operator selected by `op`, instantiated for compType.
using opReduce = typename reduce_binary_operator<compType, op>::opType;
// Unary operator types. The two boolean arguments are fixed to (true, false) here; the
// warp-wise struct passes its isFirstCall / isLastCall flags in the same positions.
// posUnaryOpType is not referenced by the Run specializations in this struct.
using preUnaryOpType = typename reduce_unary_operator<compType, op, true, false>::preUnaryOp;
using posUnaryOpType = typename reduce_unary_operator<compType, op, true, false>::posUnaryOp;
// Packed 2-D layout (GredAccessesPerThreadInBlock x BlockSize) through which the
// block-wise reduction addresses the LDS tile.
static constexpr auto buffer2dDesc = make_naive_tensor_descriptor_packed(
make_tuple(Number<GredAccessesPerThreadInBlock>{}, Number<BlockSize>{}));
// Block-level reduction helper; instantiated with blockIsOneRow == true.
using blockwise_reduce =
BlockwiseReduction_2d_block_buffer<decltype(buffer2dDesc), true, opReduce, nanPropaOpt>;
// Number of elements in one LDS tile, i.e. the element count of buffer2dDesc.
static constexpr index_t BlockBufferSize = buffer2dDesc.GetElementSize();
// Compile-time index 0, used to address single-element buffers and descriptor lengths.
static constexpr auto I0 = Number<0>{};
// Device entry point of the multi-block reduction. Only declared here; the behaviour is
// provided by the explicit specializations that follow in this struct:
//   RunId == 1 : each block writes one partial value to ws_values_global at its block
//                id; ws_indices_global is ignored.
//   RunId == 2 : as above, and additionally writes the matching index to
//                ws_indices_global.
// BlkGroupSize is the number of consecutive blocks that share one index along
// dimension 0 of src2dDesc. alpha and beta are ignored by both specializations.
template <int RunId>
__device__ static void Run(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
int BlkGroupSize,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
srcDataType* const __restrict__ ws_values_global,
int* const __restrict__ ws_indices_global);
// Run<1>: first stage of a multi-block reduction, values only.
//
// BlkGroupSize consecutive blocks cooperate on one index along dimension 0 of
// src2dDesc; each block handles its own contiguous share of dimension 1, staged
// through an LDS tile of BlockBufferSize elements. Thread 0 of every block stores the
// block's partial result to ws_values_global at the block's global id.
//
// alpha, beta and ws_indices_global are unused by this variant.
template <>
__device__ static void Run<1>(const src2dDescType& src2dDesc,
                              const dst1dDescType& dst1dDesc,
                              int origReduceLen,
                              int BlkGroupSize,
                              srcDataType alpha,
                              const srcDataType* const __restrict__ p_src_global,
                              dstDataType beta,
                              srcDataType* const __restrict__ ws_values_global,
                              int* const __restrict__ ws_indices_global)
{
    (void)ws_indices_global;
    (void)alpha; // unused
    (void)beta;  // unused

    auto initVal = opReduce::GetZeroVal();

    // LDS tile shared by the threads of the block
    __shared__ compType ldsTile[BlockBufferSize];

    // One workspace slot per block of the launch.
    const auto workspaceLen = dst1dDesc.GetLength(I0) * BlkGroupSize;

    const auto srcBuf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
        p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(initVal));
    auto workspaceBuf =
        make_dynamic_buffer<AddressSpaceEnum_t::Global>(ws_values_global, workspaceLen);
    auto tileBuf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(ldsTile, BlockBufferSize);

    StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accBuf;
    accBuf(I0) = initVal;

    const auto reduceLen = src2dDesc.GetLength(Number<1>{});

    const preUnaryOpType preOp(origReduceLen);

    const index_t tid          = get_thread_local_1d_id();
    const index_t blockId      = get_block_1d_id();
    const index_t groupId      = blockId / BlkGroupSize;
    const index_t blockInGroup = blockId % BlkGroupSize;

    // Share of dimension 1 per block: ceil(reduceLen / BlkGroupSize), rounded up to a
    // whole number of LDS tiles.
    const auto rawShare = (reduceLen + BlkGroupSize - 1) / BlkGroupSize;
    const index_t reduceSizePerBlock =
        ((rawShare + BlockBufferSize - 1) / BlockBufferSize) * BlockBufferSize;

    constexpr auto tileDesc = make_naive_tensor_descriptor_packed(
        make_tuple(Number<1>{}, Number<BlockSize * GredAccessesPerThreadInBlock>{}));

    using ThreadSlice   = Sequence<1, GredAccessesPerThreadInBlock>;
    using ThreadCluster = Sequence<1, BlockSize>;

    using TileLoad = BlockwiseTensorSliceTransfer_v4<BlockSize,
                                                     InMemoryDataOperationEnum_t::Set,
                                                     Sequence<1, BlockBufferSize>,
                                                     ThreadSlice,
                                                     ThreadCluster,
                                                     Sequence<0, 1>,
                                                     srcDataType,
                                                     compType,
                                                     src2dDescType,
                                                     decltype(tileDesc),
                                                     Sequence<0, 1>,
                                                     Sequence<0, 1>,
                                                     1,
                                                     1,
                                                     1,
                                                     1,
                                                     1,
                                                     1,
                                                     false,
                                                     true>;
    auto tileLoad = TileLoad(src2dDesc,
                             make_multi_index(groupId, blockInGroup * reduceSizePerBlock),
                             tileDesc,
                             make_multi_index(0, 0));

    constexpr auto tileStep = make_multi_index(0, BlockBufferSize);

    // Number of BlockSize-wide rows this block has to consume.
    const index_t totalRows = (reduceSizePerBlock + BlockSize - 1) / BlockSize;

    for(index_t doneRows = 0; doneRows < totalRows; doneRows += GredAccessesPerThreadInBlock)
    {
        // global memory -> LDS tile
        tileLoad.RunRead(src2dDesc, srcBuf);
        tileLoad.RunWrite(tileDesc, tileBuf);

        __syncthreads();

        // element-wise operation applied before reducing
        blockwise_reduce::operate_on_elements(preOp, tileBuf);

        // A full tile holds GredAccessesPerThreadInBlock rows; otherwise take what is left.
        index_t rowsThisPass;
        if(doneRows < totalRows - GredAccessesPerThreadInBlock)
            rowsThisPass = GredAccessesPerThreadInBlock;
        else
            rowsThisPass = totalRows - doneRows;

        blockwise_reduce::Reduce(tileBuf, rowsThisPass, accBuf(I0));

        tileLoad.MoveSrcSliceWindow(src2dDesc, tileStep);
    }

    // Only the first thread of the block stores the block's partial result.
    if(tid != 0)
        return;

    constexpr auto scalarDesc = make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
    const auto workspaceDesc  = make_naive_tensor_descriptor_packed(make_tuple(workspaceLen));

    using WorkspaceStore = ThreadwiseTensorSliceTransfer_v1r3<compType,
                                                              srcDataType,
                                                              decltype(scalarDesc),
                                                              decltype(workspaceDesc),
                                                              Sequence<1>,
                                                              Sequence<0>,
                                                              0,
                                                              1,
                                                              InMemoryDataOperationEnum_t::Set,
                                                              1,
                                                              true>;
    auto workspaceStore = WorkspaceStore(workspaceDesc, make_multi_index(blockId));

    workspaceStore.Run(scalarDesc, make_tuple(I0), accBuf, workspaceDesc, workspaceBuf);
}
// Run<2>: first stage of a multi-block reduction that tracks values and indices.
//
// BlkGroupSize consecutive blocks cooperate on one index along dimension 0 of
// src2dDesc; each block handles its own contiguous share of dimension 1, staged
// through two LDS tiles (values and indices) of BlockBufferSize elements each.
// Thread 0 of every block stores the block's partial value to ws_values_global and the
// matching index to ws_indices_global, both at the block's global id.
//
// alpha and beta are unused by this variant.
template <>
__device__ static void Run<2>(const src2dDescType& src2dDesc,
                              const dst1dDescType& dst1dDesc,
                              int origReduceLen,
                              int BlkGroupSize,
                              srcDataType alpha,
                              const srcDataType* const __restrict__ p_src_global,
                              dstDataType beta,
                              srcDataType* const __restrict__ ws_values_global,
                              int* const __restrict__ ws_indices_global)
{
    (void)alpha; // unused
    (void)beta;  // unused

    auto zeroVal = opReduce::GetZeroVal();

    // LDS tiles for the staged values and their indices
    __shared__ compType p_in_block_values_buffer[BlockBufferSize];
    __shared__ int p_in_block_indices_buffer[BlockBufferSize];

    const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
        p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
    // One workspace slot per block of the launch, for values and for indices.
    auto workspace_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
        ws_values_global, dst1dDesc.GetLength(I0) * BlkGroupSize);
    auto workspace_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
        ws_indices_global, dst1dDesc.GetLength(I0) * BlkGroupSize);

    auto in_block_val_buf =
        make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_values_buffer, BlockBufferSize);
    auto in_block_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
        p_in_block_indices_buffer, BlockBufferSize);

    StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
    StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;

    accuValue_buf(I0) = zeroVal;
    accuIndex_buf(I0) = 0;

    const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
    const int divider         = origReduceLen;

    const preUnaryOpType preUnaryOp(divider);

    const index_t thread_local_id = get_thread_local_1d_id();
    const index_t block_global_id = get_block_1d_id();
    const index_t blkgroup_id     = block_global_id / BlkGroupSize;
    const index_t block_local_id  = block_global_id % BlkGroupSize;

    // Share of dimension 1 per block: ceil(toReduceLength / BlkGroupSize), rounded up
    // to a whole number of LDS tiles.
    const index_t reduceSizePerBlock =
        (((toReduceLength + BlkGroupSize - 1) / BlkGroupSize + BlockBufferSize - 1) /
         BlockBufferSize) *
        BlockBufferSize;

    constexpr auto in_block_desc = make_naive_tensor_descriptor_packed(
        make_tuple(Number<1>{}, Number<BlockSize * GredAccessesPerThreadInBlock>{}));

    using ThreadSliceLengths   = Sequence<1, GredAccessesPerThreadInBlock>;
    using ThreadClusterLengths = Sequence<1, BlockSize>;

    auto blockwise_src_load = BlockwiseTensorSliceTransfer_v4<BlockSize,
                                                              InMemoryDataOperationEnum_t::Set,
                                                              Sequence<1, BlockBufferSize>,
                                                              ThreadSliceLengths,
                                                              ThreadClusterLengths,
                                                              Sequence<0, 1>,
                                                              srcDataType,
                                                              compType,
                                                              src2dDescType,
                                                              decltype(in_block_desc),
                                                              Sequence<0, 1>,
                                                              Sequence<0, 1>,
                                                              1,
                                                              1,
                                                              1,
                                                              1,
                                                              1,
                                                              1,
                                                              false,
                                                              true>(
        src2dDesc,
        make_multi_index(blkgroup_id, block_local_id * reduceSizePerBlock),
        in_block_desc,
        make_multi_index(0, 0));

    constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);

    // Number of BlockSize-wide rows this block has to consume.
    const index_t toReduceBlocks = (reduceSizePerBlock + BlockSize - 1) / BlockSize;

    // Offset along dimension 1 of the first element of the current tile.
    int indexOffset = block_local_id * reduceSizePerBlock;

    for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
        reducedBlocks += GredAccessesPerThreadInBlock)
    {
        blockwise_reduce::init_buffer_indices(in_block_idx_buf, indexOffset);

        // global memory -> LDS value tile
        blockwise_src_load.RunRead(src2dDesc, src_global_buf);
        blockwise_src_load.RunWrite(in_block_desc, in_block_val_buf);

        __syncthreads();

        // unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually
        // done here
        blockwise_reduce::operate_on_elements(preUnaryOp, in_block_val_buf);

        index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
                                    ? GredAccessesPerThreadInBlock
                                    : toReduceBlocks - reducedBlocks;

        blockwise_reduce::Reduce2(in_block_val_buf,
                                  in_block_idx_buf,
                                  BlocksInOneOp,
                                  accuValue_buf(I0),
                                  accuIndex_buf(I0));

        indexOffset += BlockBufferSize;

        blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);

        // Reduce2 (blockIsOneRow path) lets thread 0 read the per-row results from both
        // LDS tiles *after* its last internal barrier. Without a barrier here, other
        // threads could already be refilling those LDS slots for the next tile while
        // thread 0 is still reading them (with the slice-per-thread write layout implied
        // by ThreadSliceLengths the row >= 1 slots belong to threads other than 0 --
        // NOTE(review): confirm against BlockwiseTensorSliceTransfer_v4). The trip
        // count is uniform across the block, so every thread reaches this barrier.
        __syncthreads();
    }

    constexpr auto ReducedDataDesc =
        make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

    const auto workspace_desc =
        make_naive_tensor_descriptor_packed(make_tuple(dst1dDesc.GetLength(I0) * BlkGroupSize));

    // The first thread in the block stores the reduced result to the global location
    // representing the block
    if(thread_local_id == 0)
    {
        auto threadwise_workspace_val_store =
            ThreadwiseTensorSliceTransfer_v1r3<compType,
                                               srcDataType,
                                               decltype(ReducedDataDesc),
                                               decltype(workspace_desc),
                                               Sequence<1>,
                                               Sequence<0>,
                                               0,
                                               1,
                                               InMemoryDataOperationEnum_t::Set,
                                               1,
                                               true>(workspace_desc,
                                                     make_multi_index(block_global_id));

        auto threadwise_workspace_idx_store =
            ThreadwiseTensorSliceTransfer_v1r3<int,
                                               int,
                                               decltype(ReducedDataDesc),
                                               decltype(workspace_desc),
                                               Sequence<1>,
                                               Sequence<0>,
                                               0,
                                               1,
                                               InMemoryDataOperationEnum_t::Set,
                                               1,
                                               true>(workspace_desc,
                                                     make_multi_index(block_global_id));

        threadwise_workspace_val_store.Run(ReducedDataDesc,
                                           make_tuple(I0),
                                           accuValue_buf,
                                           workspace_desc,
                                           workspace_global_val_buf);
        threadwise_workspace_idx_store.Run(ReducedDataDesc,
                                           make_tuple(I0),
                                           accuIndex_buf,
                                           workspace_desc,
                                           workspace_global_idx_buf);
    }
}
};
} // namespace ck
#endif
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
#define CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_binop.hpp"
namespace ck {
template <typename buffer2dDescType,
bool blockIsOneRow,
typename opReduce,
NanPropagation_t nanPropaOpt>
struct BlockwiseReduction_2d_block_buffer
{
// Element type the reduction computes in, taken from the reduction operator.
using compType = typename opReduce::dataType;
// Compile-time instance of the 2-D descriptor used to turn (row, column) pairs into
// linear offsets of the block buffer.
static constexpr auto buffer2dDesc = buffer2dDescType{};
// blockIsOneRow selects the orientation of the buffer: when true, dimension 1 has
// BlockSize entries and dimension 0 has NumBlocks entries; when false the roles of the
// two dimensions are swapped.
static constexpr index_t BlockSize =
blockIsOneRow ? buffer2dDesc.GetLength(Number<1>{}) : buffer2dDesc.GetLength(Number<0>{});
static constexpr index_t NumBlocks =
blockIsOneRow ? buffer2dDesc.GetLength(Number<0>{}) : buffer2dDesc.GetLength(Number<1>{});
// Binary operation used by the reductions below, selected by nanPropaOpt, opReduce and
// compType.
using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
// Value-only block reduction; indices are not accumulated by this interface.
//
// block_buffer   : buffer addressed through buffer2dDesc; it is modified in place
//                  (partial results are written back into it).
// toReduceBlocks : number of entries along the "other" dimension each thread folds
//                  before the cross-thread step.
// accuData       : accumulator; thread 0 combines the block's result into it, other
//                  threads leave it untouched.
//
// The function contains __syncthreads() calls. The cross-thread step repeatedly halves
// BlockSize.
template <typename BufferType>
__device__ static void
Reduce(BufferType& block_buffer, index_t toReduceBlocks, compType& accuData)
{
    const index_t tid = get_thread_local_1d_id();

    // Linear offset of (thread, entry along the other dimension), honouring the
    // orientation selected by blockIsOneRow.
    const auto slot = [](index_t thread, index_t other) -> index_t {
        return blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(other, thread))
                             : buffer2dDesc.CalculateOffset(make_tuple(thread, other));
    };

    // Step 1: every thread folds its own entries into a private partial result.
    compType partial = opReduce::GetZeroVal();

    for(index_t other = 0; other < toReduceBlocks; ++other)
    {
        compType elem = type_convert<compType>{}(block_buffer[slot(tid, other)]);

        binop::calculate(partial, elem);
    }

    // Publish the partial result in the thread's first slot.
    block_buffer(slot(tid, 0)) = partial;

    __syncthreads();

    // Step 2: combine the per-thread partials pairwise, halving the span each round.
    for(index_t span = BlockSize / 2; span > 0; span /= 2)
    {
        if(tid < span)
        {
            const index_t mine  = slot(tid, 0);
            const index_t other = slot(tid + span, 0);

            compType lhs = type_convert<compType>{}(block_buffer[mine]);
            compType rhs = type_convert<compType>{}(block_buffer[other]);

            binop::calculate(lhs, rhs);

            block_buffer(mine) = type_convert<compType>{}(lhs);
        }

        __syncthreads();
    }

    // Step 3: thread 0 merges the block's result into the caller's accumulator.
    if(tid == 0)
    {
        compType blockResult = type_convert<compType>{}(block_buffer[0]);

        binop::calculate(accuData, blockResult);
    }
}
// Block-wide reduction that carries an index along with every value: the data held in
// block_buffer and the matching entries of block_indices_buffer are folded together, and
// thread 0 finally merges the resulting (value, index) pair into accuData/accuIndex.
template <typename BufferType, typename IdxBufferType>
__device__ static void Reduce2(BufferType& block_buffer,
                               IdxBufferType& block_indices_buffer,
                               index_t toReduceBlocks,
                               compType& accuData,
                               int& accuIndex)
{
    const index_t tid = get_thread_local_1d_id();

    compType partialVal = opReduce::GetZeroVal();
    int partialIdx      = 0;

    // Folds the (value, index) pair stored at srcOffset into the pair stored at dstOffset.
    auto foldPair = [&](index_t dstOffset, index_t srcOffset) {
        compType dstVal = type_convert<compType>{}(block_buffer[dstOffset]);
        compType srcVal = type_convert<compType>{}(block_buffer[srcOffset]);
        int dstIdx      = block_indices_buffer[dstOffset];
        int srcIdx      = block_indices_buffer[srcOffset];

        binop::calculate(dstVal, srcVal, dstIdx, srcIdx);

        block_buffer(dstOffset)         = type_convert<compType>{}(dstVal);
        block_indices_buffer(dstOffset) = dstIdx;
    };

    if constexpr(blockIsOneRow)
    {
        // Tree-reduce each of the toReduceBlocks rows on its own; the result of a row is
        // left in the element at position 0 of that row.
        for(index_t row = 0; row < toReduceBlocks; ++row)
        {
            for(index_t step = 1; step < BlockSize; step *= 2)
            {
                if(tid % (step * 2) == 0)
                {
                    const index_t keepOffset =
                        buffer2dDesc.CalculateOffset(make_tuple(row, tid));
                    const index_t mergeOffset =
                        buffer2dDesc.CalculateOffset(make_tuple(row, tid + step));

                    foldPair(keepOffset, mergeOffset);
                }

                // reached by every thread of the block, active in this step or not
                __syncthreads();
            }
        }

        if(tid == 0)
        {
            // Thread 0 combines the per-row results and then updates the caller's accumulator
            for(index_t row = 0; row < toReduceBlocks; ++row)
            {
                const index_t rowOffset = buffer2dDesc.CalculateOffset(make_tuple(row, 0));

                compType rowVal = type_convert<compType>{}(block_buffer[rowOffset]);
                int rowIdx      = block_indices_buffer[rowOffset];

                binop::calculate(partialVal, rowVal, partialIdx, rowIdx);
            }

            binop::calculate(accuData, partialVal, accuIndex, partialIdx);
        }
    }
    else
    {
        // Each thread first folds the toReduceBlocks elements addressed by its own thread id
        for(index_t col = 0; col < toReduceBlocks; ++col)
        {
            const index_t elemOffset = buffer2dDesc.CalculateOffset(make_tuple(tid, col));

            compType elemVal = type_convert<compType>{}(block_buffer[elemOffset]);
            int elemIdx      = block_indices_buffer[elemOffset];

            binop::calculate(partialVal, elemVal, partialIdx, elemIdx);
        }

        // ... and publishes its partial result in its element (tid, 0)
        const index_t ownOffset = buffer2dDesc.CalculateOffset(make_tuple(tid, 0));

        block_buffer(ownOffset)         = partialVal;
        block_indices_buffer(ownOffset) = partialIdx;

        __syncthreads();

        // Tree-reduce the per-thread partial results down to the element of thread 0
        for(index_t step = 1; step < BlockSize; step *= 2)
        {
            if(tid % (step * 2) == 0)
            {
                const index_t keepOffset = buffer2dDesc.CalculateOffset(make_tuple(tid, 0));
                const index_t mergeOffset =
                    buffer2dDesc.CalculateOffset(make_tuple(tid + step, 0));

                foldPair(keepOffset, mergeOffset);
            }

            __syncthreads();
        }

        if(tid == 0)
        {
            compType finalVal = type_convert<compType>{}(block_buffer[0]);
            int finalIdx      = block_indices_buffer[0];

            binop::calculate(accuData, finalVal, accuIndex, finalIdx);
        }
    }
};
// Stores `value` into the NumBlocks elements of the block-wise buffer that are addressed by
// the calling thread; a block-wide barrier follows every store.
template <typename BufferType>
__device__ static void set_buffer_value(BufferType& block_buffer, compType value)
{
    const index_t tid = get_thread_local_1d_id();

    for(index_t blk = 0; blk < NumBlocks; ++blk)
    {
        index_t dstOffset;

        // the thread id is the second coordinate when blockIsOneRow is set, the first otherwise
        if(blockIsOneRow)
            dstOffset = buffer2dDesc.CalculateOffset(make_tuple(blk, tid));
        else
            dstOffset = buffer2dDesc.CalculateOffset(make_tuple(tid, blk));

        block_buffer(dstOffset) = value;

        __syncthreads();
    }
};
// Initializes the block-wise indices buffer: the index written for an element is the sum of
// the element's offset inside the buffer and the global starting index `indexStart`.
template <typename IdxBufferType>
__device__ static void init_buffer_indices(IdxBufferType& block_indices_buffer, int indexStart)
{
    const index_t tid = get_thread_local_1d_id();

    for(index_t blk = 0; blk < NumBlocks; ++blk)
    {
        index_t elemOffset;

        // the thread id is the second coordinate when blockIsOneRow is set, the first otherwise
        if(blockIsOneRow)
            elemOffset = buffer2dDesc.CalculateOffset(make_tuple(blk, tid));
        else
            elemOffset = buffer2dDesc.CalculateOffset(make_tuple(tid, blk));

        block_indices_buffer(elemOffset) = elemOffset + indexStart;

        __syncthreads();
    }
};
// Applies `unary_op` in place to the NumBlocks elements of the block-wise buffer that are
// addressed by the calling thread; a block-wide barrier follows every update.
template <typename unary_op_type, typename BufferType>
__device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& block_buffer)
{
    const index_t tid = get_thread_local_1d_id();

    for(index_t blk = 0; blk < NumBlocks; ++blk)
    {
        index_t elemOffset;

        // the thread id is the second coordinate when blockIsOneRow is set, the first otherwise
        if(blockIsOneRow)
            elemOffset = buffer2dDesc.CalculateOffset(make_tuple(blk, tid));
        else
            elemOffset = buffer2dDesc.CalculateOffset(make_tuple(tid, blk));

        block_buffer(elemOffset) = unary_op(block_buffer[elemOffset]);

        __syncthreads();
    }
};
};
}; // end of namespace ck
#endif
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_FUNCTIONS_THREADWISE_HPP
#define CK_REDUCTION_FUNCTIONS_THREADWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_binop.hpp"
namespace ck {
// Reduction over the elements a single thread holds in a StaticBuffer. All loops are unrolled
// at compile time through static_for; no other thread is involved.
template <typename BufferType, typename opReduce, NanPropagation_t nanPropaOpt>
struct ThreadReduce
{
    using compType = typename opReduce::dataType;
    using binop    = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;

    static_assert(BufferType::IsStaticBuffer(), "Thread-wise reduction needs use StaticBuffer!");

    static_assert(
        std::is_same<typename BufferType::type, compType>::value,
        "Data type of StaticBuffer for Thread-wise reduction should be same as the compType!");

    static constexpr index_t ThreadBufferLen = BufferType::Size();

    // Folds every element of thread_buffer into accuData; indices are not tracked.
    __device__ static void Reduce(const BufferType& thread_buffer, compType& accuData)
    {
        const auto foldElement = [&](auto iElem) {
            binop::calculate(accuData, thread_buffer[iElem]);
        };

        static_for<0, ThreadBufferLen, 1>{}(foldElement);
    }

    // Folds every element of thread_buffer into accuData and tracks the index of the selected
    // element in accuIndex; element I is given the index `I + indexStart`.
    // Called by the Direct_ThreadWise reduction method at first-time reduction.
    __device__ static void
    Reduce2(const BufferType& thread_buffer, compType& accuData, int& accuIndex, int indexStart)
    {
        const auto foldElement = [&](auto iElem) {
            int elemIndex = iElem + indexStart;

            binop::calculate(accuData, thread_buffer[iElem], accuIndex, elemIndex);
        };

        static_for<0, ThreadBufferLen, 1>{}(foldElement);
    }

    // Overwrites every element of the per-thread buffer with `value`.
    // cppcheck-suppress constParameter
    __device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
    {
        const auto assignElement = [&](auto iElem) { thread_buffer(iElem) = value; };

        static_for<0, ThreadBufferLen, 1>{}(assignElement);
    }

    // Replaces every element of the per-thread buffer by unary_op(element).
    template <typename unary_op_type>
    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
    {
        const auto transformElement = [&](auto iElem) {
            thread_buffer(iElem) = unary_op(thread_buffer[iElem]);
        };

        static_for<0, ThreadBufferLen, 1>{}(transformElement);
    }
};
// Per-thread reduction for the case where the indices belonging to the values are already
// available in a second StaticBuffer of the same size.
template <typename BufferType,
          typename IdxBufferType,
          typename opReduce,
          NanPropagation_t nanPropaOpt>
struct ThreadReduceWithIndicesInput
{
    using compType = typename opReduce::dataType;
    using binop    = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;

    static_assert(BufferType::IsStaticBuffer(), "Thread-wise reduction needs use StaticBuffer!");

    static_assert(IdxBufferType::IsStaticBuffer(),
                  "Thread-wise reduction needs use StaticBuffer for indices!");

    static_assert(
        std::is_same<typename BufferType::type, compType>::value,
        "Data type of StaticBuffer for Thread-wise reduction should be same as the compType!");

    static_assert(std::is_same<typename IdxBufferType::type, index_t>::value,
                  "Indices type of StaticBuffer for Thread-wise reduction should be index_t!");

    static_assert(BufferType::Size() == IdxBufferType::Size(),
                  "StaticBuffers for data and indices should have the same sizes!");

    static constexpr index_t ThreadBufferLen = BufferType::Size();

    // Folds every (value, index) pair of the two buffers into accuData/accuIndex.
    // Called by the Direct_ThreadWise reduction method at second-time reduction.
    __device__ static void Reduce(const BufferType& thread_buffer,
                                  const IdxBufferType& thread_indices_buffer,
                                  compType& accuData,
                                  int& accuIndex)
    {
        const auto foldPair = [&](auto iElem) {
            binop::calculate(
                accuData, thread_buffer[iElem], accuIndex, thread_indices_buffer[iElem]);
        };

        static_for<0, ThreadBufferLen, 1>{}(foldPair);
    }

    // Overwrites every element of the per-thread data buffer with `value`.
    // cppcheck-suppress constParameter
    __device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
    {
        const auto assignElement = [&](auto iElem) { thread_buffer(iElem) = value; };

        static_for<0, ThreadBufferLen, 1>{}(assignElement);
    }

    // Replaces every element of the per-thread data buffer by unary_op(element).
    template <typename unary_op_type>
    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
    {
        const auto transformElement = [&](auto iElem) {
            thread_buffer(iElem) = unary_op(thread_buffer[iElem]);
        };

        static_for<0, ThreadBufferLen, 1>{}(transformElement);
    }
};
}; // end of namespace ck
#endif
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_FUNCTIONS_WARPWISE_HPP
#define CK_REDUCTION_FUNCTIONS_WARPWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_binop.hpp"
namespace ck {
// Warp-wise reduction of per-thread StaticBuffers. Every thread first folds its own buffer
// into a partial result; the partial results of the threads of one warp are then combined,
// either with the HIP shuffle built-ins (compType float/double) or through a staging buffer
// in shared memory (any other compType). The combined result is available in the thread with
// in-warp id 0.
template <typename BufferType, index_t BlockSize, typename opReduce, NanPropagation_t nanPropaOpt>
struct WarpReduce
{
    using compType = typename opReduce::dataType;
    using binop    = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;

    static_assert(BufferType::IsStaticBuffer(),
                  "Per-thread buffer for WarpWise reduction should be StaticBuffer!");
    static_assert(std::is_same<typename BufferType::type, compType>::value,
                  "Data type of per-thread StaticBuffer for WarpWise reduction should be same as "
                  "the compType!");

    static constexpr index_t ThreadBufferLen = BufferType::Size();

    // The shuffle-based implementations are selected for float and double only
    static constexpr bool have_builtin_shuffle =
        std::is_same<compType, float>::value || std::is_same<compType, double>::value;

    // This interface does not accumulate on indices
    __device__ static void Reduce(const BufferType& thread_buffer, compType& accuData)
    {
        if constexpr(have_builtin_shuffle)
            ReduceImpl1(thread_buffer, accuData);
        else
            ReduceImpl2(thread_buffer, accuData);
    };

    // This interface implementation uses HIP built-in device shuffling functions
    __device__ static void ReduceImpl1(const BufferType& thread_buffer, compType& accuData)
    {
        compType lAccuData = opReduce::GetZeroVal();

        // per-thread partial result
        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto I) { binop::calculate(lAccuData, thread_buffer[I]); });

        // synchronize among all threads in this warp
        __all(1);

        // halve the shuffle distance every step; the complete result ends up in lane 0
        for(index_t stride = warpSize / 2; stride > 0; stride /= 2)
        {
            compType tmpVal = __shfl_down(lAccuData, stride, warpSize);
            binop::calculate(lAccuData, tmpVal);
            __all(1);
        }

        binop::calculate(accuData, lAccuData);
    };

    // This interface implementation does not use HIP built-in device shuffling functions
    // since for fp16, built-in shuffling functions are not provided by HIP
    __device__ static void ReduceImpl2(const BufferType& thread_buffer, compType& accuData)
    {
        compType lAccuData = opReduce::GetZeroVal();

        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto I) { binop::calculate(lAccuData, thread_buffer[I]); });

        __syncthreads();

        index_t thread_id        = get_thread_local_1d_id();
        index_t warpId           = thread_id / warpSize;
        index_t thread_inwarp_id = thread_id % warpSize;

        // one slot per thread of the block; each warp works on its own warpSize-long slice
        __shared__ compType shuffle_buffer[BlockSize];

        compType* myBuffer = &shuffle_buffer[warpId * warpSize];

        myBuffer[thread_inwarp_id] = lAccuData;

        __syncthreads();

        for(index_t stride = warpSize / 2; stride > 0; stride /= 2)
        {
            // only the lower `stride` lanes fold, so all accesses stay inside the warp's slice
            if(thread_inwarp_id < stride)
            {
                compType currVal1 = myBuffer[thread_inwarp_id];
                compType currVal2 = myBuffer[thread_inwarp_id + stride];

                binop::calculate(currVal1, currVal2);

                myBuffer[thread_inwarp_id] = currVal1;
            }

            __syncthreads();
        }

        if(thread_inwarp_id == 0)
            binop::calculate(accuData, myBuffer[0]);
    };

    // This interface accumulates on both data values and indices and is called by Direct_WarpWise
    // reduction method at first-time reduction
    __device__ static void
    Reduce2(const BufferType& thread_buffer, compType& accuData, int& accuIndex, int indexStart)
    {
        if constexpr(have_builtin_shuffle)
            Reduce2Impl1(thread_buffer, accuData, accuIndex, indexStart);
        else
            Reduce2Impl2(thread_buffer, accuData, accuIndex, indexStart);
    };

    // This interface implementation uses HIP built-in device shuffling functions
    __device__ static void Reduce2Impl1(const BufferType& thread_buffer,
                                        compType& accuData,
                                        int& accuIndex,
                                        int indexStart)
    {
        compType lAccuData       = opReduce::GetZeroVal();
        int lAccuIndex           = 0;
        index_t thread_inwarp_id = get_thread_local_1d_id() % warpSize;

        // element I of this thread is given the index
        // thread_inwarp_id * ThreadBufferLen + I + indexStart
        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
            int currIndex = thread_inwarp_id * ThreadBufferLen + I + indexStart;
            binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, currIndex);
        });

        // synchronize among all threads in this warp
        __all(1);

        // ascending strides: at every step the pair with the lower lane id is the accumulator
        for(index_t stride = 1; stride < warpSize; stride *= 2)
        {
            compType tmpVal = __shfl_down(lAccuData, stride, warpSize);
            int tmpIndex    = __shfl_down(lAccuIndex, stride, warpSize);

            binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex);

            __all(1);
        }

        if(thread_inwarp_id == 0)
            binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex);
    };

    // This interface implementation does not use HIP built-in device shuffling functions since for
    // fp16, built-in shuffling functions are not provided by HIP
    __device__ static void Reduce2Impl2(const BufferType& thread_buffer,
                                        compType& accuData,
                                        int& accuIndex,
                                        int indexStart)
    {
        compType lAccuData       = opReduce::GetZeroVal();
        int lAccuIndex           = 0;
        index_t thread_id        = get_thread_local_1d_id();
        index_t warpId           = thread_id / warpSize;
        index_t thread_inwarp_id = thread_id % warpSize;

        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
            int currIndex = thread_inwarp_id * ThreadBufferLen + I + indexStart;
            binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, currIndex);
        });

        // one slot per thread of the block; each warp works on its own warpSize-long slice
        __shared__ compType shuffle_data_buffer[BlockSize];
        __shared__ int shuffle_indices_buffer[BlockSize];

        compType* myDataBuffer = &shuffle_data_buffer[warpId * warpSize];
        int* myIndicesBuffer   = &shuffle_indices_buffer[warpId * warpSize];

        myDataBuffer[thread_inwarp_id]    = lAccuData;
        myIndicesBuffer[thread_inwarp_id] = lAccuIndex;

        __syncthreads();

        for(index_t stride = 1; stride < warpSize; stride *= 2)
        {
            // Only lanes aligned to 2*stride fold their partner in. Without this guard every
            // lane would read slot thread_inwarp_id + stride, which leaves the warp's slice
            // (and, for the last warp, the shared arrays) and races with the lane owning
            // that slot. The value produced in slot 0 is the same either way.
            if(thread_inwarp_id % (stride * 2) == 0)
            {
                compType currVal1 = myDataBuffer[thread_inwarp_id];
                compType currVal2 = myDataBuffer[thread_inwarp_id + stride];
                int currIndex1    = myIndicesBuffer[thread_inwarp_id];
                int currIndex2    = myIndicesBuffer[thread_inwarp_id + stride];

                binop::calculate(currVal1, currVal2, currIndex1, currIndex2);

                myDataBuffer[thread_inwarp_id]    = currVal1;
                myIndicesBuffer[thread_inwarp_id] = currIndex1;
            }

            // outside the guard: must be reached by every thread of the block
            __syncthreads();
        }

        if(thread_inwarp_id == 0)
            binop::calculate(accuData, myDataBuffer[0], accuIndex, myIndicesBuffer[0]);
    };

    // Set the elements in the per-thread buffer to a specific value
    // cppcheck-suppress constParameter
    __device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
    {
        static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });

        __all(1);
    };

    // Execute unary operation on the per-thread buffer elements
    template <typename unary_op_type>
    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
    {
        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });

        __all(1);
    };
};
// Warp-wise reduction for the case where the indices belonging to the values are already
// available in a second per-thread StaticBuffer. Every thread first folds its own
// (value, index) pairs; the partial results of the threads of one warp are then combined,
// either with the HIP shuffle built-ins (compType float/double) or through staging buffers
// in shared memory (any other compType).
template <typename BufferType,
          typename IdxBufferType,
          index_t BlockSize,
          typename opReduce,
          NanPropagation_t nanPropaOpt>
struct WarpReduceWithIndicesInput
{
    using compType = typename opReduce::dataType;
    using binop    = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;

    static_assert(BufferType::IsStaticBuffer(),
                  "Per-thread buffer for WarpWise reduction should be StaticBuffer!");
    static_assert(IdxBufferType::IsStaticBuffer(),
                  "Per-thread buffer for WarpWise reduction should be StaticBuffer for indices!");
    static_assert(std::is_same<typename BufferType::type, compType>::value,
                  "Data type of per-thread StaticBuffer for WarpWise reduction should be same as "
                  "the compType!");
    static_assert(
        std::is_same<typename IdxBufferType::type, index_t>::value,
        "Indices type per-thread of StaticBuffer for WarpWise reduction should be index_t!");
    static_assert(BufferType::Size() == IdxBufferType::Size(),
                  "StaticBuffers for data and indices should have the same sizes!");

    static constexpr index_t ThreadBufferLen = BufferType::Size();

    // The shuffle-based implementation is selected for float and double only
    static constexpr bool have_builtin_shuffle =
        std::is_same<compType, float>::value || std::is_same<compType, double>::value;

    // This interface accumulates on both data values and indices and is called by Direct_WarpWise
    // reduction method at second-time reduction
    __device__ static void Reduce(const BufferType& thread_buffer,
                                  const IdxBufferType& thread_indices_buffer,
                                  compType& accuData,
                                  int& accuIndex)
    {
        if constexpr(have_builtin_shuffle)
            ReduceImpl1(thread_buffer, thread_indices_buffer, accuData, accuIndex);
        else
            ReduceImpl2(thread_buffer, thread_indices_buffer, accuData, accuIndex);
    };

    // This interface implementation uses HIP built-in device shuffling functions
    __device__ static void ReduceImpl1(const BufferType& thread_buffer,
                                       const IdxBufferType& thread_indices_buffer,
                                       compType& accuData,
                                       int& accuIndex)
    {
        compType lAccuData = opReduce::GetZeroVal();
        int lAccuIndex     = 0;

        // per-thread partial result
        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
            binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, thread_indices_buffer[I]);
        });

        // synchronize among all threads in this warp
        __all(1);

        // ascending strides: at every step the pair with the lower lane id is the accumulator
        for(index_t stride = 1; stride < warpSize; stride *= 2)
        {
            compType tmpVal = __shfl_down(lAccuData, stride, warpSize);
            int tmpIndex    = __shfl_down(lAccuIndex, stride, warpSize);

            binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex);

            __all(1);
        }

        binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex);
    };

    // This interface implementation does not use HIP built-in device shuffling functions
    // since for fp16, built-in shuffling functions are not provided by HIP
    __device__ static void ReduceImpl2(const BufferType& thread_buffer,
                                       const IdxBufferType& thread_indices_buffer,
                                       compType& accuData,
                                       int& accuIndex)
    {
        compType lAccuData       = opReduce::GetZeroVal();
        int lAccuIndex           = 0;
        index_t thread_id        = get_thread_local_1d_id();
        index_t warpId           = thread_id / warpSize;
        index_t thread_inwarp_id = thread_id % warpSize;

        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
            binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, thread_indices_buffer[I]);
        });

        // one slot per thread of the block; each warp works on its own warpSize-long slice
        __shared__ compType shuffle_data_buffer[BlockSize];
        __shared__ int shuffle_indices_buffer[BlockSize];

        compType* myDataBuffer = &shuffle_data_buffer[warpId * warpSize];
        int* myIndicesBuffer   = &shuffle_indices_buffer[warpId * warpSize];

        myDataBuffer[thread_inwarp_id]    = lAccuData;
        myIndicesBuffer[thread_inwarp_id] = lAccuIndex;

        __syncthreads();

        for(index_t stride = 1; stride < warpSize; stride *= 2)
        {
            // Only lanes aligned to 2*stride fold their partner in. Without this guard every
            // lane would read slot thread_inwarp_id + stride, which leaves the warp's slice
            // (and, for the last warp, the shared arrays) and races with the lane owning
            // that slot. The value produced in slot 0 is the same either way.
            if(thread_inwarp_id % (stride * 2) == 0)
            {
                compType currVal1 = myDataBuffer[thread_inwarp_id];
                compType currVal2 = myDataBuffer[thread_inwarp_id + stride];
                int currIndex1    = myIndicesBuffer[thread_inwarp_id];
                int currIndex2    = myIndicesBuffer[thread_inwarp_id + stride];

                binop::calculate(currVal1, currVal2, currIndex1, currIndex2);

                myDataBuffer[thread_inwarp_id]    = currVal1;
                myIndicesBuffer[thread_inwarp_id] = currIndex1;
            }

            // outside the guard: must be reached by every thread of the block
            __syncthreads();
        }

        if(thread_inwarp_id == 0)
            binop::calculate(accuData, myDataBuffer[0], accuIndex, myIndicesBuffer[0]);
    };

    // Set the elements in the per-thread buffer to a specific value
    // cppcheck-suppress constParameter
    __device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
    {
        static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });

        __all(1);
    };

    // Execute unary operation on the per-thread buffer elements
    template <typename unary_op_type>
    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
    {
        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });

        __all(1);
    };
};
}; // end of namespace ck
#endif
...@@ -713,9 +713,6 @@ struct ThreadwiseTensorSliceTransfer_v3 ...@@ -713,9 +713,6 @@ struct ThreadwiseTensorSliceTransfer_v3
: src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)), : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)),
dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)) dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin))
{ {
// TODO: fix this
static_assert(is_same<SrcData, DstData>::value,
"wrong! current implementation assume SrcData and DstData are same type");
} }
__device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
...@@ -985,7 +982,8 @@ struct ThreadwiseTensorSliceTransfer_v3 ...@@ -985,7 +982,8 @@ struct ThreadwiseTensorSliceTransfer_v3
constexpr index_t buffer_offset = constexpr index_t buffer_offset =
buffer_desc_.CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector); buffer_desc_.CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector);
dst_tmp_vector.template AsType<DstData>()(i) = buffer_[Number<buffer_offset>{}]; dst_tmp_vector.template AsType<DstData>()(i) =
type_convert<DstData>{}(buffer_[Number<buffer_offset>{}]);
}); });
using dst_vector_t = typename decltype(dst_tmp_vector)::type; using dst_vector_t = typename decltype(dst_tmp_vector)::type;
......
...@@ -38,6 +38,10 @@ struct DynamicBuffer ...@@ -38,6 +38,10 @@ struct DynamicBuffer
return BufferAddressSpace; return BufferAddressSpace;
} }
__host__ __device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; }
__host__ __device__ constexpr T& operator()(index_t i) { return p_data_[i]; }
template <typename X, template <typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type, typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value, typename scalar_type<remove_cvref_t<T>>::type>::value,
......
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_COMMON_HPP
#define CK_REDUCTION_COMMON_HPP
// The enumerations below must be kept in sync with include/miopen/reduce_common.hpp
namespace ck {
// Identifies the strategy used to carry out a reduction.
// NOTE(review): the names suggest one value per implementation (thread-wise, warp-wise,
// block-wise, multi-block) -- confirm against the code that selects the method.
enum class ReductionMethod_t
{
DirectThreadWise = 1,
DirectWarpWise = 2,
BlockWise = 3,
MultiBlock = 4
}; // end of ReductionMethod_t
// Identifies the reduction operation. The value 8 (MUL_NO_ZEROS) is reserved by the
// commented-out enumerator below and is currently not available.
enum class ReduceTensorOp_t
{
ADD = 0,
MUL = 1,
MIN = 2,
MAX = 3,
AMAX = 4,
AVG = 5,
NORM1 = 6,
NORM2 = 7,
// MUL_NO_ZEROS = 8,
};
// Selects how NaN inputs are treated while reducing; used as a template argument to pick
// the matching detail::binop_with_nan_check specialization.
enum class NanPropagation_t
{
NOT_PROPAGATE_NAN = 0,
PROPAGATE_NAN = 1,
};
// Selects whether indices are produced in addition to the reduced values.
// NOTE(review): FLATTENED_INDICES presumably means indices into the flattened reduced
// dimensions -- confirm against the caller.
enum class ReduceTensorIndices_t
{
NO_INDICES = 0,
FLATTENED_INDICES = 1,
};
// Identifies the integer width of produced indices.
// Note that the numeric values do not follow the bit widths (32, 64, 16, 8).
enum class IndicesType_t
{
INDICES_32BIT = 0,
INDICES_64BIT = 1,
INDICES_16BIT = 2,
INDICES_8BIT = 3,
};
// Predicate telling whether a value equals one. The test is spelled as a pair of ordered
// comparisons (<= and >=) rather than with operator==.
struct float_equal_one
{
    template <class T>
    __device__ static inline bool apply(T x)
    {
        const T one = type_convert<T>{}(1.0f);

        return (x <= one) && (x >= one);
    }

    // Functor form, forwards to apply()
    template <class T>
    __device__ inline bool operator()(T x)
    {
        const bool isOne = float_equal_one::apply(x);

        return isOne;
    }
};
// Predicate telling whether a value equals zero. The test is spelled as a pair of ordered
// comparisons (<= and >=) rather than with operator==.
struct float_equal_zero
{
    template <class T>
    __device__ static inline bool apply(T x)
    {
        const T zero = type_convert<T>{}(0.0f);

        return (x <= zero) && (x >= zero);
    }

    // Functor form, forwards to apply()
    template <class T>
    __device__ inline bool operator()(T x)
    {
        const bool isZero = float_equal_zero::apply(x);

        return isZero;
    }
};
}; // end of namespace ck
#endif
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_FUNCTIONS_BINOP_HPP
#define CK_REDUCTION_FUNCTIONS_BINOP_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
namespace ck {
namespace detail {
// NaN test for half_t values, forwarding to the __hisnan device function.
static inline __device__ bool isnan(half_t x)
{
    const bool isNotANumber = __hisnan(x);

    return isNotANumber;
}
// Primary template, declared only: the implementation is provided by one partial
// specialization per NanPropagation_t value.
template <NanPropagation_t nanPropaOpt, typename opReduce, typename compType>
struct binop_with_nan_check;
// NOT_PROPAGATE_NAN: the operands are handed to opReduce as they are, without any NaN test.
template <typename opReduce, typename compType>
struct binop_with_nan_check<NanPropagation_t::NOT_PROPAGATE_NAN, opReduce, compType>
{
    // Value-only accumulation: accuVal <- opReduce(accuVal, currVal)
    // cppcheck-suppress constParameter
    __device__ static inline void calculate(compType& accuVal, compType currVal)
    {
        opReduce reducer{};

        reducer(accuVal, currVal);
    }

    // Value + index accumulation, used when opReduce is indexable and the user asked for
    // indices: accuIndex follows currIndex whenever opReduce reports that accuVal changed.
    __device__ static inline void
    // cppcheck-suppress constParameter
    calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex)
    {
        bool valueReplaced = false;

        opReduce{}(accuVal, currVal, valueReplaced);

        accuIndex = valueReplaced ? currIndex : accuIndex;
    }
};
// PROPAGATE_NAN: a NaN operand replaces the accumulated value (and index) unconditionally;
// any other operand is handed to opReduce.
template <typename opReduce, typename compType>
struct binop_with_nan_check<NanPropagation_t::PROPAGATE_NAN, opReduce, compType>
{
    // Value-only accumulation
    __device__ static inline void calculate(compType& accuVal, compType currVal)
    {
        if(!isnan(currVal))
        {
            opReduce{}(accuVal, currVal);
            return;
        }

        accuVal = currVal;
    }

    // Value + index accumulation, used when opReduce is indexable and the user asked for
    // indices
    __device__ static inline void
    calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex)
    {
        if(!isnan(currVal))
        {
            bool valueReplaced = false;

            opReduce{}(accuVal, currVal, valueReplaced);

            // the index only follows when opReduce reports that accuVal changed
            if(valueReplaced)
                accuIndex = currIndex;

            return;
        }

        // NaN operand: take over both the value and its index
        accuVal   = currVal;
        accuIndex = currIndex;
    }
};
}; // namespace detail
}; // end of namespace ck
#endif
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_OPERATOR_HPP
#define CK_REDUCTION_OPERATOR_HPP
#include "reduction_common.hpp"
namespace ck {
namespace reduce {
// Every binary operator used in reduction is represented by a templated functor class. Each
// functor class must provide at least three members:
// 1) GetZeroVal() -- returns the "identity element" of the binary operator, i.e. the unique
//                    element of the algebraic space that leaves any other element unchanged
//                    when combined with it.
// 2) indexable    -- boolean value indicating whether the indices of the operated elements can
//                    be recorded. Usually the Min/Max operators may need to record the indices
//                    of elements; for operators like Add/Mul there is no need to record them.
// 3) operator()   -- the first argument must be both an input and an output; the corresponding
//                    variable usually stores the accumulated result of many operator() calls.
//                    The second argument is only an input. For an indexable binary operator, a
//                    second overload of operator() takes a third argument (an output) that
//                    indicates whether the accumulated value (the first argument) has changed,
//                    in which case the recorded accumulated index also needs to be changed.
// Addition functor; not indexable.
template <class T>
struct Add
{
    using dataType = T;

    static constexpr bool indexable = false;

    // Identity element of the addition: 0 converted to T
    __device__ static T GetZeroVal()
    {
        const T additiveIdentity = type_convert<T>{}(0.0f);

        return additiveIdentity;
    }

    // a <- a + b
    __device__ inline constexpr void operator()(T& a, T b) const
    {
        const T sum = a + b;

        a = sum;
    }
};
// Multiplication functor; not indexable.
template <class T>
struct Mul
{
    using dataType = T;

    static constexpr bool indexable = false;

    // Identity element of the multiplication: 1 converted to T
    __device__ static T GetZeroVal()
    {
        const T multiplicativeIdentity = type_convert<T>{}(1.0f);

        return multiplicativeIdentity;
    }

    // a <- a * b
    __device__ inline constexpr void operator()(T& a, T b) const
    {
        const T product = a * b;

        a = product;
    }
};
// Maximum functor; indexable, i.e. it can report whether the accumulated value was replaced.
template <class T>
struct Max
{
    using dataType = T;

    // Identity element of max: the lowest finite value of T. numeric_limits<T>::min() must not
    // be used here, since for floating-point types it is the smallest POSITIVE normal value and
    // would win against every negative input.
    __device__ static T GetZeroVal() { return std::numeric_limits<T>::lowest(); };

    // a <- max(a, b)
    __device__ inline constexpr void operator()(T& a, T b) const
    {
        if(a < b)
            a = b;
    }

    // a <- max(a, b); `changed` is set to true when a was replaced by b and is left untouched
    // otherwise
    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
    {
        if(a < b)
        {
            a       = b;
            changed = true;
        }
    }

    static constexpr bool indexable = true;
};
// Binary minimum functor: keeps the smaller of the accumulator and the new value.
template <class T>
struct Min
{
    using dataType = T;

    // Min can report which element produced the result.
    static constexpr bool indexable = true;

    // Identity element of min: the largest finite value of T.
    __device__ static T GetZeroVal() { return std::numeric_limits<T>::max(); }

    // Replaces the accumulator `a` by `b` when `b` is strictly smaller.
    __device__ inline constexpr void operator()(T& a, T b) const
    {
        a = (a > b) ? b : a;
    }

    // Same as above; additionally sets `changed` to true when the accumulator was replaced.
    // `changed` is left untouched otherwise.
    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
    {
        const bool replaced = a > b;

        if(replaced)
        {
            a       = b;
            changed = true;
        }
    }
};
// Identity element of Max for half precision.
// The lowest float is converted to half_t. The former std::numeric_limits<float>::min() is a tiny
// POSITIVE value (about 1.18e-38) that is below the half-precision range, so it was not a valid
// identity for max as soon as the input contained negative values.
// NOTE(review): the result of converting the lowest float to half_t depends on type_convert
// (presumably it becomes -infinity, mirroring Min<half_t> which converts the largest float) --
// confirm.
template <>
__device__ half_t Max<half_t>::GetZeroVal()
{
    return type_convert<half_t>{}(std::numeric_limits<float>::lowest());
}
// Identity element of Min for half precision: the largest float converted to half_t.
template <>
__device__ half_t Min<half_t>::GetZeroVal()
{
    constexpr float largest_float = std::numeric_limits<float>::max();

    return type_convert<half_t>{}(largest_float);
}
// Unary operators are usually applied element-wise before the reduction is executed on the
// elements.
// They are needed for an easy implementation of the reduction types AVG, NRM1 and NRM2.
// Identity unary functor with scaling: returns its argument multiplied by 1/divider.
template <class T, bool hasDividing>
struct unary_identic
{
    // Reciprocal of the divider passed at construction.
    float scaler = 1.0f;

    __device__ unary_identic(const int divider = 1) : scaler(1.0f / static_cast<float>(divider))
    {
    }

    __device__ inline constexpr T operator()(T a) const
    {
        return a * type_convert<T>{}(scaler);
    }
};
// Identity unary functor without scaling: returns its argument unchanged.
template <class T>
struct unary_identic<T, false>
{
    // The divider is accepted only for interface parity with the scaling variant.
    __device__ unary_identic(const int divider = 1) { static_cast<void>(divider); }

    __device__ inline constexpr T operator()(T a) const
    {
        return a;
    }
};
// Squaring unary functor with scaling: returns (a * a) multiplied by 1/divider.
template <class T, bool hasDividing>
struct unary_square
{
    // Reciprocal of the divider passed at construction.
    float scaler = 1.0f;

    __device__ unary_square(const int divider = 1) : scaler(1.0f / static_cast<float>(divider))
    {
    }

    __device__ inline constexpr T operator()(T a) const
    {
        // The square is stored in a T first, exactly as the accumulating form did.
        const T squared = a * a;

        return squared * type_convert<T>{}(scaler);
    }
};
// Squaring unary functor without scaling: returns a * a.
template <class T>
struct unary_square<T, false>
{
    // The divider is accepted only for interface parity with the scaling variant.
    __device__ unary_square(const int divider = 1) { static_cast<void>(divider); }

    __device__ inline constexpr T operator()(T a) const
    {
        return a * a;
    }
};
// Absolute-value unary functor with scaling: returns abs(a) multiplied by 1/divider.
template <class T, bool hasDividing>
struct unary_abs
{
    // Reciprocal of the divider passed at construction.
    float scaler = 1.0f;

    __device__ unary_abs(const int divider = 1) : scaler(1.0f / static_cast<float>(divider)) {}

    __device__ inline constexpr T operator()(T a) const
    {
        // The magnitude is stored in a T first, exactly as the in-place form did.
        const T magnitude = abs(a);

        return magnitude * type_convert<T>{}(scaler);
    }
};
// Absolute-value unary functor without scaling: returns abs(a).
template <class T>
struct unary_abs<T, false>
{
    // The divider is accepted only for interface parity with the scaling variant.
    __device__ unary_abs(const int divider = 1) { static_cast<void>(divider); }

    __device__ inline constexpr T operator()(T a) const
    {
        return abs(a);
    }
};
// We know for sure that 4.0 has __habs(), but 3.0 does not have it.
// Let's assume that __habs() exists since 3.5.
// Fallback definition of __habs() for HIP packages older than the version tested below.
// NOTE(review): an undefined HIP_PACKAGE_VERSION_FLAT evaluates to 0 in #if, which also enables
// this fallback -- confirm that this is intended for every toolchain that builds this header.
#if HIP_PACKAGE_VERSION_FLAT < 3005000000
inline __device__ __half __habs(__half x)
{
// Reinterpret the half value as its 16-bit pattern.
union
{
__half half;
unsigned short u16;
} val;
val.half = x;
// Clear bit 15, the sign bit of the 16-bit pattern; all other bits are kept.
val.u16 = val.u16 & 0x7fff;
return val.half;
}
#endif
// Half-precision absolute-value functor with scaling: uses __habs() and multiplies by 1/divider.
template <bool hasDividing>
struct unary_abs<half_t, hasDividing>
{
    // Reciprocal of the divider passed at construction.
    float scaler = 1.0f;

    __device__ unary_abs(const int divider = 1) : scaler(1.0f / static_cast<float>(divider)) {}

    __device__ inline half_t operator()(half_t a) const
    {
        const half_t magnitude = static_cast<half_t>(__habs(a));

        return magnitude * type_convert<half_t>{}(scaler);
    }
};
// Half-precision absolute-value functor without scaling: returns __habs(a) as half_t.
template <>
struct unary_abs<half_t, false>
{
    // The divider is accepted only for interface parity with the scaling variant.
    __device__ unary_abs(const int divider = 1) { static_cast<void>(divider); }

    __device__ inline half_t operator()(half_t a) const
    {
        return static_cast<half_t>(__habs(a));
    }
};
// Square-root unary functor. The divider is accepted only for interface parity with the other
// unary functors and is ignored.
// The generic version computes in single precision through sqrtf().
template <class T>
struct unary_sqrt
{
    __device__ unary_sqrt(const int divider = 1) { (void)divider; }

    __device__ inline T operator()(T a) const { return sqrtf(a); }
};

// Double-precision specialization: sqrtf() would first narrow the argument to float and return
// a float-precision result, so the double overload of sqrt() is used instead.
template <>
struct unary_sqrt<double>
{
    __device__ unary_sqrt(const int divider = 1) { (void)divider; }

    __device__ inline double operator()(double a) const { return ::sqrt(a); }
};
// Half-precision square-root functor: uses hsqrt(). The divider is ignored.
template <>
struct unary_sqrt<half_t>
{
    __device__ unary_sqrt(const int divider = 1) { static_cast<void>(divider); }

    __device__ inline half_t operator()(half_t a) const
    {
        return static_cast<half_t>(hsqrt(a));
    }
};
}; // end of namespace reduce
// The templated struct reduce_binary_operator maps the enum Ids of binary operators to their
// respective functor classes.
// The "GetZeroVal()" interface and the boolean member "indexable" are also provided by
// reduce_binary_operator for easier checking by the upper-layer code in the kernels.
// Primary template is only declared; each supported ReduceTensorOp_t value has a specialization.
// Every specialization exposes the functor type (opType) and forwards GetZeroVal() and
// indexable from that functor.
template <typename T, ReduceTensorOp_t op>
struct reduce_binary_operator;

// ADD accumulates with reduce::Add.
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::ADD>
{
    using opType   = reduce::Add<T>;
    using dataType = T;

    static constexpr bool indexable = opType::indexable;

    __device__ static T GetZeroVal() { return opType::GetZeroVal(); }
};

// MUL accumulates with reduce::Mul.
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MUL>
{
    using opType   = reduce::Mul<T>;
    using dataType = T;

    static constexpr bool indexable = opType::indexable;

    __device__ static T GetZeroVal() { return opType::GetZeroVal(); }
};

// MIN accumulates with reduce::Min.
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MIN>
{
    using opType   = reduce::Min<T>;
    using dataType = T;

    static constexpr bool indexable = opType::indexable;

    __device__ static T GetZeroVal() { return opType::GetZeroVal(); }
};

// MAX accumulates with reduce::Max.
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MAX>
{
    using opType   = reduce::Max<T>;
    using dataType = T;

    static constexpr bool indexable = opType::indexable;

    __device__ static T GetZeroVal() { return opType::GetZeroVal(); }
};

// AMAX also accumulates with reduce::Max.
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::AMAX>
{
    using opType   = reduce::Max<T>;
    using dataType = T;

    static constexpr bool indexable = opType::indexable;

    __device__ static T GetZeroVal() { return opType::GetZeroVal(); }
};

// AVG accumulates with reduce::Add.
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::AVG>
{
    using opType   = reduce::Add<T>;
    using dataType = T;

    static constexpr bool indexable = opType::indexable;

    __device__ static T GetZeroVal() { return opType::GetZeroVal(); }
};

// NORM1 accumulates with reduce::Add.
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::NORM1>
{
    using opType   = reduce::Add<T>;
    using dataType = T;

    static constexpr bool indexable = opType::indexable;

    __device__ static T GetZeroVal() { return opType::GetZeroVal(); }
};

// NORM2 accumulates with reduce::Add.
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::NORM2>
{
    using opType   = reduce::Add<T>;
    using dataType = T;

    static constexpr bool indexable = opType::indexable;

    __device__ static T GetZeroVal() { return opType::GetZeroVal(); }
};
// The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary
// functor classes.
// The two unary functors are called before and after the Reduction is executed, respectively.
template <typename T, ReduceTensorOp_t op, bool isFirsReduce, bool isLastReduce>
struct reduce_unary_operator
{
using preUnaryOp = reduce::unary_identic<T, false>;
using posUnaryOp = reduce::unary_identic<T, false>;
};
template <typename T, bool isFirstReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::AVG, isFirstReduce, true>
{
using preUnaryOp = reduce::unary_identic<T, false>;
using posUnaryOp = reduce::unary_identic<T, true>;
};
template <typename T, bool isLastReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM1, true, isLastReduce>
{
using preUnaryOp = reduce::unary_abs<T, false>;
using posUnaryOp = reduce::unary_identic<T, false>;
};
template <typename T, bool isLastReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::AMAX, true, isLastReduce>
{
using preUnaryOp = reduce::unary_abs<T, false>;
using posUnaryOp = reduce::unary_identic<T, false>;
};
template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, false>
{
using preUnaryOp = reduce::unary_square<T, false>;
using posUnaryOp = reduce::unary_identic<T, false>;
};
template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, true>
{
using preUnaryOp = reduce::unary_square<T, false>;
using posUnaryOp = reduce::unary_sqrt<T>;
};
template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, false, true>
{
using preUnaryOp = reduce::unary_identic<T, false>;
using posUnaryOp = reduce::unary_sqrt<T>;
};
} // end of namespace ck
#endif
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_blockwise.hpp"
// Compile-time configuration of this kernel wrapper.
// NOTE(review): the CK_PARAM_* macros are not defined in this file; presumably they are supplied
// on the compiler command line by the host side -- confirm.
using namespace ck;
// Element types of the source tensor, the destination tensor and the accumulation, each selected
// through a DataTypeEnum_t value.
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
// Number of dimensions of the source and destination tensors.
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
// Indices of the source dimensions that are reduced.
using toReduceDims = Sequence<CK_PARAM_TOREDUCE_DIMS>;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
// Any non-zero macro value selects NaN propagation / flattened indices respectively.
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
////////////////////////////////////////////////////////////////////////////////////////
// This wrapper has no invariant dimensions: the to-reduce dimensions alone must form a valid
// permutation covering all srcDims source dimensions.
using specDims = typename sequence_merge<Sequence<>, toReduceDims>::type;
static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
"Wrong invariant and/or toReduce dimensions!");
// Since all source dimensions are reduced, the destination must be one-dimensional.
static_assert(dstDims == 1,
"If all source dimensions are reduced, the dest should have only one dimension !!");
// Indices are produced only when the operator can record them and the caller asked for them.
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
// Helper functions using variadic template arguments.

// Builds a tuple from the entries of `lengths` selected by the compile-time index pack Ns...,
// each entry converted to index_t.
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
}
// Builds a tuple from the first `arraySize` entries of `lengths` (entries 0 .. arraySize-1).
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    // Index sequence 0, 1, ..., arraySize-1.
    using IndexSeq = typename arithmetic_sequence_gen<0, arraySize, 1>::type;

    return make_tuple_from_array_and_index_seq(lengths, IndexSeq{});
}
// Builds a tuple holding the compile-time values Ns... of a Sequence.
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
}
// Preparation kernel: builds the 2-D source descriptor and the 1-D destination descriptor from
// the runtime lengths/strides and stores them in the workspace buffer `ws_global`
// (source descriptor at byte offset 0, destination descriptor at byte offset 2048).
//
// Only the first srcDims (resp. dstDims) length/stride arguments are used; the remaining ones
// are ignored. GridSize and BlkGroupSize are unused.
// NOTE(review): nothing here checks that the source descriptor fits into the first 2048 bytes
// of the workspace -- confirm against the host-side workspace layout.
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
int BlkGroupSize,
int inLength0,
int inLength1,
int inLength2,
int inLength3,
int inLength4,
int inLength5,
int inStride0,
int inStride1,
int inStride2,
int inStride3,
int inStride4,
int inStride5,
int outLength0,
int outLength1,
int outLength2,
int outLength3,
int outLength4,
int outLength5,
int outStride0,
int outStride1,
int outStride2,
int outStride3,
int outStride4,
int outStride5,
void* __restrict__ ws_global)
{
(void)GridSize;
(void)BlkGroupSize;
// Workspace slots for the two descriptors.
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
// Gather the scalar kernel arguments into arrays so they can be indexed at compile time.
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
const int dstLengths[6] = {
outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
const int dstStrides[6] = {
outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
// Merge all source dimensions into a single dimension ...
const auto one_dim_srcDesc = transform_tensor_descriptor(
srcDesc,
make_tuple(make_merge_transform(tupleSrcLengths)),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
// ... then unmerge it into a 2-D view of lengths (1, total): dimension 0 has length 1 and
// dimension 1 holds every element.
auto src2dDesc = transform_tensor_descriptor(
one_dim_srcDesc,
make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
// Merge all destination dimensions into one.
auto dst1dDesc = transform_tensor_descriptor(
dstDesc,
make_tuple(make_merge_transform(tupleDstLengths)),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
const auto invariantLen = src2dDesc.GetLength(Number<0>{});
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
if constexpr(src2d_need_padding)
{
// Pad dimension 1 on the right so that its length becomes the next multiple of copySliceLen.
const auto srcPad =
((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pass_through_transform(invariantLen),
make_pad_transform(toReduceLen, 0, srcPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
// Only the thread with hipThreadIdx_x == 0 writes the descriptor.
if(hipThreadIdx_x == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(hipThreadIdx_x == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
// The destination descriptor is stored without padding.
if(hipThreadIdx_x == 0)
*static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
};
template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 1>::type{};
// don't have to use accurate strides to get an expected referrence type
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_src2dDesc =
transform_tensor_descriptor(ref_one_dim_srcDesc,
make_tuple(make_unmerge_transform(
make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
ref_dstDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
// used by the BlockWise and MultiBlock method
using refType_src2dDesc_padded_34 = decltype(
transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pass_through_transform(ref_invariantLen),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dst1dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dst1dDesc);
};
// File-level shorthands for the reference descriptor types. get_ref_desc_types is instantiated
// with three arguments here (srcDims, dstDims, toReduceDims).
using refType_src2dDesc =
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded =
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc_padded;
// Returns a copy of the 2-D source descriptor stored at `p_src2dDesc`, interpreting the bytes as
// the padded or the non-padded reference type depending on `need_padding`.
//
// The only template parameter is `need_padding`: the caller in this file instantiates the
// function as get_reduction_src2d_descriptor<src2d_need_padding>(...), exactly like
// get_reduction_dst1d_descriptor. A leading ReductionMethod_t parameter was never used in the
// body and made that call ill-formed.
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
}
// Returns a copy of the 1-D destination descriptor stored at `p_dst1dDesc`, interpreting the
// bytes as the padded or the non-padded reference type depending on `need_padding`.
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
    {
        const auto* p_desc = static_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc);
        return *p_desc;
    }
    else
    {
        const auto* p_desc = static_cast<const refType_dst1dDesc*>(p_dst1dDesc);
        return *p_desc;
    }
}
// Reduction kernel: loads the two descriptors from the workspace (source at byte offset 0,
// destination at byte offset 2048) and runs the block-wise 2-D reduction on them.
// BlkGroupSize and ws_buf2_bytes_offset are unused by this kernel.
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
int BlkGroupSize,
float alpha,
const void* __restrict__ p_src_global,
float beta,
void* __restrict__ p_dst_global,
void* __restrict__ ws_global,
long ws_buf2_bytes_offset,
void* __restrict__ indices_global)
{
(void)BlkGroupSize;
(void)ws_buf2_bytes_offset;
const void* p_src2dDesc = ws_global;
const void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
// Descriptors are copied out of the workspace by value.
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
// NOTE(review): the two `true` arguments presumably mark this as both the first and the last
// reduction call -- confirm against GridwiseReduction_xy_to_x_blockwise.
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
srcDataType,
dstDataType,
compType,
decltype(src2dDesc),
decltype(dst1dDesc),
op,
nanPropaOpt,
reduceIndicesOpt,
true,
true,
GredAccessesPerThreadInBlock>;
// Run<2> is selected when indices must be produced, Run<1> otherwise.
constexpr int RunId = need_indices ? 2 : 1;
gridwise_2d_reduce::template Run<RunId>(
src2dDesc,
dst1dDesc,
origReduceLen,
alpha,
static_cast<const srcDataType* const __restrict__>(p_src_global),
beta,
static_cast<dstDataType* const __restrict__>(p_dst_global),
// A null pointer is passed for the const int* argument; indices are written to indices_global.
static_cast<const int* const __restrict__>(nullptr),
static_cast<int* const __restrict__>(indices_global));
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_blockwise.hpp"
// Compile-time configuration of this kernel wrapper.
// NOTE(review): the CK_PARAM_* macros are not defined in this file; presumably they are supplied
// on the compiler command line by the host side -- confirm.
using namespace ck;
// Element types of the source tensor, the destination tensor and the accumulation, each selected
// through a DataTypeEnum_t value.
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
// Number of dimensions of the source and destination tensors.
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
// Indices of the source dimensions that are reduced / kept.
using toReduceDims = Sequence<CK_PARAM_TOREDUCE_DIMS>;
using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
// Any non-zero macro value selects NaN propagation / flattened indices respectively.
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
////////////////////////////////////////////////////////////////////////////////////////
// Invariant and to-reduce dimensions together must form a valid permutation covering all
// srcDims source dimensions.
using specDims = typename sequence_merge<invariantDims, toReduceDims>::type;
static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
"Wrong invariant and/or toReduce dimensions!");
// The number of invariant dimensions can be zero if all dimensions are to be reduced; in that
// case the destination must be one-dimensional.
static_assert(invariantDims::Size() > 0 || dstDims == 1,
"If all source dimensions are reduced, the dest should have only one dimension !!");
// Indices are produced only when the operator can record them and the caller asked for them.
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
// Helper functions using variadic template arguments.

// Builds a tuple from the entries of `lengths` selected by the compile-time index pack Ns...,
// each entry converted to index_t.
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
}
// Builds a tuple from the first `arraySize` entries of `lengths` (entries 0 .. arraySize-1).
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    // Index sequence 0, 1, ..., arraySize-1.
    using IndexSeq = typename arithmetic_sequence_gen<0, arraySize, 1>::type;

    return make_tuple_from_array_and_index_seq(lengths, IndexSeq{});
}
// Builds a tuple holding the compile-time values Ns... of a Sequence.
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
}
// Preparation kernel: builds the 2-D source descriptor and the 1-D destination descriptor from
// the runtime lengths/strides and stores them in the workspace buffer `ws_global`
// (source descriptor at byte offset 0, destination descriptor at byte offset 2048).
//
// Only the first srcDims (resp. dstDims) length/stride arguments are used; the remaining ones
// are ignored. GridSize and BlkGroupSize are unused.
// NOTE(review): nothing here checks that the source descriptor fits into the first 2048 bytes
// of the workspace -- confirm against the host-side workspace layout.
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
int BlkGroupSize,
int inLength0,
int inLength1,
int inLength2,
int inLength3,
int inLength4,
int inLength5,
int inStride0,
int inStride1,
int inStride2,
int inStride3,
int inStride4,
int inStride5,
int outLength0,
int outLength1,
int outLength2,
int outLength3,
int outLength4,
int outLength5,
int outStride0,
int outStride1,
int outStride2,
int outStride3,
int outStride4,
int outStride5,
void* __restrict__ ws_global)
{
(void)GridSize;
(void)BlkGroupSize;
// Workspace slots for the two descriptors.
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
// Gather the scalar kernel arguments into arrays so they can be indexed at compile time.
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
const int dstLengths[6] = {
outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
const int dstStrides[6] = {
outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
// Source lengths split into the reduced and the kept (invariant) dimensions.
const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
const auto invariantDimLengths =
make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});
// 2-D view of the source: dimension 0 merges the invariant dimensions, dimension 1 merges the
// dimensions to reduce.
auto src2dDesc =
transform_tensor_descriptor(srcDesc,
make_tuple(make_merge_transform(invariantDimLengths),
make_merge_transform(toReduceDimLengths)),
make_tuple(invariantDims{}, toReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
// Merge all destination dimensions into one.
auto dst1dDesc = transform_tensor_descriptor(
dstDesc,
make_tuple(make_merge_transform(tupleDstLengths)),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
const auto invariantLen = src2dDesc.GetLength(Number<0>{});
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
if constexpr(src2d_need_padding)
{
// Pad dimension 1 on the right so that its length becomes the next multiple of copySliceLen.
const auto srcPad =
((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pass_through_transform(invariantLen),
make_pad_transform(toReduceLen, 0, srcPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
// Only the thread with hipThreadIdx_x == 0 writes the descriptor.
if(hipThreadIdx_x == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(hipThreadIdx_x == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
// The destination descriptor is stored without padding.
if(hipThreadIdx_x == 0)
*static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
};
// Computes, at compile time, the descriptor TYPES that the preparation kernel produces, so that
// the reduction kernel can reinterpret the workspace bytes as descriptors of those types.
// Only the types matter here; the reference lengths and strides are placeholder values.
template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
static constexpr auto ref_toReduceDimLengths =
typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
static constexpr auto ref_invariantDimLengths =
typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};
// don't have to use accurate strides to get an expected reference type
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
// 2-D view: dimension 0 merges the invariant dimensions, dimension 1 the dimensions to reduce.
static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
make_tuple(invariantDims{}, toReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
// All destination dimensions merged into one.
static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
ref_dstDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
// used by the BlockWise and MultiBlock method
using refType_src2dDesc_padded_34 = decltype(
transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pass_through_transform(ref_invariantLen),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dst1dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dst1dDesc);
};
// File-level shorthands for the reference descriptor types of this configuration.
using refType_src2dDesc =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
refType_dst1dDesc_padded;
// Returns a copy of the 2-D source descriptor stored at `p_src2dDesc`, interpreting the bytes as
// the padded or the non-padded reference type depending on `need_padding`.
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
    {
        const auto* p_desc = static_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc);
        return *p_desc;
    }
    else
    {
        const auto* p_desc = static_cast<const refType_src2dDesc*>(p_src2dDesc);
        return *p_desc;
    }
}
// Returns a copy of the 1-D destination descriptor stored at `p_dst1dDesc`, interpreting the
// bytes as the padded or the non-padded reference type depending on `need_padding`.
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
    {
        const auto* p_desc = static_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc);
        return *p_desc;
    }
    else
    {
        const auto* p_desc = static_cast<const refType_dst1dDesc*>(p_dst1dDesc);
        return *p_desc;
    }
}
// Reduction kernel: loads the two descriptors from the workspace (source at byte offset 0,
// destination at byte offset 2048) and runs the block-wise 2-D reduction on them.
// BlkGroupSize and ws_buf2_bytes_offset are unused by this kernel.
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
int BlkGroupSize,
float alpha,
const void* __restrict__ p_src_global,
float beta,
void* __restrict__ p_dst_global,
void* __restrict__ ws_global,
long ws_buf2_bytes_offset,
void* __restrict__ indices_global)
{
(void)BlkGroupSize;
(void)ws_buf2_bytes_offset;
const void* p_src2dDesc = ws_global;
const void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
// Descriptors are copied out of the workspace by value.
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
// NOTE(review): the two `true` arguments presumably mark this as both the first and the last
// reduction call -- confirm against GridwiseReduction_xy_to_x_blockwise.
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
srcDataType,
dstDataType,
compType,
decltype(src2dDesc),
decltype(dst1dDesc),
op,
nanPropaOpt,
reduceIndicesOpt,
true,
true,
GredAccessesPerThreadInBlock>;
// Run<2> is selected when indices must be produced, Run<1> otherwise.
constexpr int RunId = need_indices ? 2 : 1;
gridwise_2d_reduce::template Run<RunId>(
src2dDesc,
dst1dDesc,
origReduceLen,
alpha,
static_cast<const srcDataType* const __restrict__>(p_src_global),
beta,
static_cast<dstDataType* const __restrict__>(p_dst_global),
// A null pointer is passed for the const int* argument; indices are written to indices_global.
static_cast<const int* const __restrict__>(nullptr),
static_cast<int* const __restrict__>(indices_global));
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_multiblock.hpp"
using namespace ck;
// Build-time configuration of this translation unit. Every value below is derived from a
// CK_PARAM_* preprocessor macro that must be defined when the file is compiled.

// Element types of the source tensor, the destination tensor and the computation, each
// looked up from a DataTypeEnum_t value through get_datatype_from_enum.
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
// Number of dimensions of the source and destination tensors.
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
// Dimension ids to reduce, and dimension ids to keep.
using toReduceDims = Sequence<CK_PARAM_TOREDUCE_DIMS>;
using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>; // this could be empty
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
// A macro value of 0 selects NOT_PROPAGATE_NAN; any other value selects PROPAGATE_NAN.
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
// A macro value of 0 selects NO_INDICES; any other value selects FLATTENED_INDICES.
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
// Select whether the padded or the unpadded descriptor type is used for the 2D source view
// and for the 1D destination view.
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
////////////////////////////////////////////////////////////////////////////////////////
// Only toReduceDims is merged in here (the invariant part is an empty Sequence), so the
// check below requires toReduceDims alone to be a valid map over all srcDims dimensions.
using specDims = typename sequence_merge<Sequence<>, toReduceDims>::type;
static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
"Wrong invariant and/or toReduce dimensions!");
// The number of invariant dimensions can be zero if all dimensions are to be reduced;
// this file requires the destination to have exactly one dimension.
static_assert(dstDims == 1,
"If all source dimensions are reduced, the dest should have only one dimension !!");
// Indices are requested only when the operator reports itself as indexable and the
// indices option is not NO_INDICES.
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
// helper functions using variadic template arguments
// Gathers selected entries of a raw integer array into a tuple.
// For each id carried by the Sequence argument, lengths[id] is converted to index_t; the
// converted values are handed to make_tuple in the order the ids appear in the Sequence.
// The caller must make sure every id is a valid position in the array.
template <index_t... Ids>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ids...>)
{
    return make_tuple(static_cast<index_t>(*(lengths + Ids))...);
}
// Packs the first arraySize entries of a raw integer array into a tuple of index_t.
// arraySize is carried by the Number<> tag argument and must lie in [1, 6]; anything else is
// rejected at compile time.
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(1 <= arraySize && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    // ids 0, 1, ..., arraySize-1
    using LeadingIds = typename arithmetic_sequence_gen<0, arraySize, 1>::type;

    return make_tuple_from_array_and_index_seq(lengths, LeadingIds{});
}
// Turns the compile-time values carried by a Sequence into a tuple holding the same values,
// in the same order, by forwarding them to make_tuple.
template <index_t... Vals>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Vals...>)
{
    return make_tuple(Vals...);
}
// Prepare kernel for the multi-block reduction in which every source dimension is reduced.
//
// It builds a 2D view of the source tensor and a 1D view of the destination tensor from the
// lengths/strides passed as scalar arguments, and stores both descriptors into the workspace
// (source view at byte 0, destination view at byte 2048 of ws_global).
//
// Parameters:
//   GridSize            - unused by this variant.
//   BlkGroupSize        - divisor used when sizing the per-block slice of the reduce dimension.
//   inLength0..5        - source lengths; only the first srcDims entries are read.
//   inStride0..5        - source strides; only the first srcDims entries are read.
//   outLength0..5       - destination lengths; only the first dstDims entries are read.
//   outStride0..5       - destination strides; only the first dstDims entries are read.
//   ws_global           - workspace that receives the two descriptors.
//
// NOTE(review): the dst1d descriptor is always stored in its unpadded form here, while the
// reader in this file selects a padded type when dst1d_need_padding is true. This presumably
// relies on dst1d_need_padding being false for this variant - confirm against the host code.
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
int BlkGroupSize,
int inLength0,
int inLength1,
int inLength2,
int inLength3,
int inLength4,
int inLength5,
int inStride0,
int inStride1,
int inStride2,
int inStride3,
int inStride4,
int inStride5,
int outLength0,
int outLength1,
int outLength2,
int outLength3,
int outLength4,
int outLength5,
int outStride0,
int outStride1,
int outStride2,
int outStride3,
int outStride4,
int outStride5,
void* __restrict__ ws_global)
{
(void)GridSize;
// Fixed byte offsets of the two descriptor slots inside the workspace.
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
// Collect the scalar arguments into arrays so they can be indexed by a compile-time Sequence.
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
const int dstLengths[6] = {
outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
const int dstStrides[6] = {
outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
// Merge all source dimensions into a single dimension ...
const auto one_dim_srcDesc = transform_tensor_descriptor(
srcDesc,
make_tuple(make_merge_transform(tupleSrcLengths)),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
// ... then unmerge it into two dimensions of lengths (1, merged length): dimension 0 is the
// invariant dimension and dimension 1 is the dimension to reduce.
auto src2dDesc = transform_tensor_descriptor(
one_dim_srcDesc,
make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
// Merge all destination dimensions into a single dimension.
auto dst1dDesc = transform_tensor_descriptor(
dstDesc,
make_tuple(make_merge_transform(tupleDstLengths)),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
const auto invariantLen = src2dDesc.GetLength(Number<0>{});
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
// ceil(toReduceLen / BlkGroupSize), rounded up to a multiple of copySliceLen.
const index_t reduceSizePerBlock =
(((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) *
copySliceLen;
if constexpr(src2d_need_padding)
{
// Extend the reduce dimension so that its length becomes reduceSizePerBlock * BlkGroupSize.
// The pad arguments are (0, srcPad); presumably (left pad, right pad) - confirm against
// make_pad_transform.
const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pass_through_transform(invariantLen),
make_pad_transform(toReduceLen, 0, srcPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
// Only the thread with hipThreadIdx_x == 0 performs the store.
if(hipThreadIdx_x == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(hipThreadIdx_x == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
// The destination descriptor is stored without any padding in this variant.
if(hipThreadIdx_x == 0)
*static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
};
// Compile-time helper that rebuilds, from placeholder lengths, descriptors with the same
// sequence of transforms as the prepare kernel, so that their types can be named.
// The resulting refType_* aliases are what the descriptor readers in this file use to
// reinterpret the workspace contents.
// Note: the toReduceDims parameter is not referenced inside this variant of the struct.
template <index_t srcDims, index_t dstDims, typename toReduceDims>
struct get_ref_desc_types
{
// Placeholder lengths (all 8 for the source, all 1 for the destination).
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 1>::type{};
// Accurate strides are not needed to obtain the expected reference type, so the lengths
// are reused as strides.
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
// Same merge-all-dimensions step as in the prepare kernel.
static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
// Same unmerge into (1, merged length) as in the prepare kernel.
static constexpr auto ref_src2dDesc =
transform_tensor_descriptor(ref_one_dim_srcDesc,
make_tuple(make_unmerge_transform(
make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
ref_dstDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
// used by the BlockWise and MultiBlock method
// (pass-through on dimension 0, pad on dimension 1; the pad amount 2 is a placeholder)
using refType_src2dDesc_padded_34 = decltype(
transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pass_through_transform(ref_invariantLen),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
// Padded form of the 1D destination descriptor (placeholder pad amount 2).
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dst1dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
// Unpadded forms.
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dst1dDesc);
};
using refType_src2dDesc =
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded =
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc_padded;
// Reads the 2D source descriptor back from the memory that p_src2dDesc points to.
// need_padding selects which reference type the bytes are interpreted as: the padded type
// when true, the unpadded type otherwise. The descriptor is returned by value.
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
    {
        using StoredDesc = refType_src2dDesc_padded_34;
        return *reinterpret_cast<const StoredDesc*>(p_src2dDesc);
    }
    else
    {
        using StoredDesc = refType_src2dDesc;
        return *reinterpret_cast<const StoredDesc*>(p_src2dDesc);
    }
}
// Reads the 1D destination descriptor back from the memory that p_dst1dDesc points to.
// need_padding selects which reference type the bytes are interpreted as: the padded type
// when true, the unpadded type otherwise. The descriptor is returned by value.
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
    {
        using StoredDesc = refType_dst1dDesc_padded;
        return *reinterpret_cast<const StoredDesc*>(p_dst1dDesc);
    }
    else
    {
        using StoredDesc = refType_dst1dDesc;
        return *reinterpret_cast<const StoredDesc*>(p_dst1dDesc);
    }
}
// First-call kernel of the multi-block reduction (all source dimensions reduced).
//
// Loads the two descriptors that the prepare kernel left in the workspace and forwards them,
// together with the source pointer and the workspace buffers, to
// GridwiseReduction_xy_to_x_multiblock::Run.
//
// Parameters:
//   origReduceLen        - forwarded unchanged to Run.
//   BlkGroupSize         - forwarded unchanged to Run.
//   alpha, beta          - forwarded unchanged to Run.
//   p_src_global         - source data, read as srcDataType.
//   p_dst_global         - unused by this kernel.
//   ws_global            - workspace: src2d descriptor at byte 0, dst1d descriptor at byte
//                          2048, first workspace buffer starting at byte 4096.
//   ws_buf2_bytes_offset - byte offset of the second workspace buffer relative to the first;
//                          a value <= 0 means no second buffer (nullptr is passed on).
//   indices_global       - unused by this kernel.
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
                                                     int BlkGroupSize,
                                                     float alpha,
                                                     const void* __restrict__ p_src_global,
                                                     float beta,
                                                     void* __restrict__ p_dst_global,
                                                     void* __restrict__ ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    // Results of this call go to the workspace only.
    (void)indices_global;
    (void)p_dst_global;

    char* const ws_bytes = static_cast<char*>(ws_global);

    const void* p_src2dDesc = ws_bytes;
    const void* p_dst1dDesc = ws_bytes + 2048;
    void* ws_buf1_global    = ws_bytes + 4096;

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_multiblock<BlockSize,
                                                                    srcDataType,
                                                                    dstDataType,
                                                                    compType,
                                                                    decltype(src2dDesc),
                                                                    decltype(dst1dDesc),
                                                                    op,
                                                                    nanPropaOpt,
                                                                    reduceIndicesOpt,
                                                                    GredAccessesPerThreadInBlock>;

    // The second workspace buffer exists only when a positive offset was supplied.
    void* ws_buf2_global = nullptr;
    if(ws_buf2_bytes_offset > 0)
    {
        ws_buf2_global = static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset;
    }

    // Run<2> is selected when indices are needed, Run<1> otherwise.
    constexpr int RunId = need_indices ? 2 : 1;

    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc,
        dst1dDesc,
        origReduceLen,
        BlkGroupSize,
        alpha,
        static_cast<const srcDataType* const __restrict__>(p_src_global),
        beta,
        static_cast<srcDataType* const __restrict__>(ws_buf1_global),
        static_cast<int* const __restrict__>(ws_buf2_global));
}
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_multiblock.hpp"
using namespace ck;
// Build-time configuration of this translation unit. Every value below is derived from a
// CK_PARAM_* preprocessor macro that must be defined when the file is compiled.

// Element types of the source tensor, the destination tensor and the computation, each
// looked up from a DataTypeEnum_t value through get_datatype_from_enum.
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
// Number of dimensions of the source and destination tensors.
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
// Dimension ids to reduce, and dimension ids to keep.
using toReduceDims = Sequence<CK_PARAM_TOREDUCE_DIMS>;
using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
// A macro value of 0 selects NOT_PROPAGATE_NAN; any other value selects PROPAGATE_NAN.
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
// A macro value of 0 selects NO_INDICES; any other value selects FLATTENED_INDICES.
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
// Select whether the padded or the unpadded descriptor type is used for the 2D source view
// and for the 1D destination view.
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
////////////////////////////////////////////////////////////////////////////////////////
// invariantDims followed by toReduceDims must form a valid map over all srcDims dimensions.
using specDims = typename sequence_merge<invariantDims, toReduceDims>::type;
static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
"Wrong invariant and/or toReduce dimensions!");
// The number of invariant dimensions can be zero if all dimensions are to be reduced
static_assert(invariantDims::Size() > 0 || dstDims == 1,
"If all source dimensions are reduced, the dest should have only one dimension !!");
// Indices are requested only when the operator reports itself as indexable and the
// indices option is not NO_INDICES.
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
// helper functions using variadic template arguments
// Gathers selected entries of a raw integer array into a tuple.
// For each id carried by the Sequence argument, lengths[id] is converted to index_t; the
// converted values are handed to make_tuple in the order the ids appear in the Sequence.
// The caller must make sure every id is a valid position in the array.
template <index_t... Ids>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ids...>)
{
    return make_tuple(static_cast<index_t>(*(lengths + Ids))...);
}
// Packs the first arraySize entries of a raw integer array into a tuple of index_t.
// arraySize is carried by the Number<> tag argument and must lie in [1, 6]; anything else is
// rejected at compile time.
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(1 <= arraySize && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    // ids 0, 1, ..., arraySize-1
    using LeadingIds = typename arithmetic_sequence_gen<0, arraySize, 1>::type;

    return make_tuple_from_array_and_index_seq(lengths, LeadingIds{});
}
// Turns the compile-time values carried by a Sequence into a tuple holding the same values,
// in the same order, by forwarding them to make_tuple.
template <index_t... Vals>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Vals...>)
{
    return make_tuple(Vals...);
}
// Prepare kernel for the multi-block reduction in which only part of the source dimensions
// is reduced.
//
// It builds a 2D view of the source tensor (dimension 0 = merged invariant dimensions,
// dimension 1 = merged dimensions to reduce) and a 1D view of the destination tensor from the
// lengths/strides passed as scalar arguments, and stores both descriptors into the workspace
// (source view at byte 0, destination view at byte 2048 of ws_global).
//
// Parameters:
//   GridSize            - unused by this variant.
//   BlkGroupSize        - divisor used when sizing the per-block slice of the reduce dimension.
//   inLength0..5        - source lengths; indexed by srcDims, invariantDims and toReduceDims.
//   inStride0..5        - source strides; only the first srcDims entries are read.
//   outLength0..5       - destination lengths; only the first dstDims entries are read.
//   outStride0..5       - destination strides; only the first dstDims entries are read.
//   ws_global           - workspace that receives the two descriptors.
//
// NOTE(review): the dst1d descriptor is always stored in its unpadded form here, while the
// reader in this file selects a padded type when dst1d_need_padding is true. This presumably
// relies on dst1d_need_padding being false for this variant - confirm against the host code.
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
int BlkGroupSize,
int inLength0,
int inLength1,
int inLength2,
int inLength3,
int inLength4,
int inLength5,
int inStride0,
int inStride1,
int inStride2,
int inStride3,
int inStride4,
int inStride5,
int outLength0,
int outLength1,
int outLength2,
int outLength3,
int outLength4,
int outLength5,
int outStride0,
int outStride1,
int outStride2,
int outStride3,
int outStride4,
int outStride5,
void* __restrict__ ws_global)
{
(void)GridSize;
// Fixed byte offsets of the two descriptor slots inside the workspace.
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
// Collect the scalar arguments into arrays so they can be indexed by a compile-time Sequence.
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
const int dstLengths[6] = {
outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
const int dstStrides[6] = {
outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
// Source lengths picked out for the reduced dimensions and for the invariant dimensions.
const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
const auto invariantDimLengths =
make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});
// Merge the invariant dimensions into dimension 0 and the reduced dimensions into dimension 1.
auto src2dDesc =
transform_tensor_descriptor(srcDesc,
make_tuple(make_merge_transform(invariantDimLengths),
make_merge_transform(toReduceDimLengths)),
make_tuple(invariantDims{}, toReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
// Merge all destination dimensions into a single dimension.
auto dst1dDesc = transform_tensor_descriptor(
dstDesc,
make_tuple(make_merge_transform(tupleDstLengths)),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
const auto invariantLen = src2dDesc.GetLength(Number<0>{});
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
// ceil(toReduceLen / BlkGroupSize), rounded up to a multiple of copySliceLen.
const index_t reduceSizePerBlock =
(((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) *
copySliceLen;
if constexpr(src2d_need_padding)
{
// Extend the reduce dimension so that its length becomes reduceSizePerBlock * BlkGroupSize.
// The pad arguments are (0, srcPad); presumably (left pad, right pad) - confirm against
// make_pad_transform.
const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pass_through_transform(invariantLen),
make_pad_transform(toReduceLen, 0, srcPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
// Only the thread with hipThreadIdx_x == 0 performs the store.
if(hipThreadIdx_x == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(hipThreadIdx_x == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
// The destination descriptor is stored without any padding in this variant.
if(hipThreadIdx_x == 0)
*static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
};
// Compile-time helper that rebuilds, from placeholder lengths, descriptors with the same
// sequence of transforms as the prepare kernel, so that their types can be named.
// The resulting refType_* aliases are what the descriptor readers in this file use to
// reinterpret the workspace contents.
template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
// Placeholder lengths (all 8), one per reduced / invariant / source / destination dimension.
static constexpr auto ref_toReduceDimLengths =
typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
static constexpr auto ref_invariantDimLengths =
typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};
// Accurate strides are not needed to obtain the expected reference type, so the lengths
// are reused as strides.
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
// Same two merges (invariant -> dimension 0, reduced -> dimension 1) as in the prepare kernel.
static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
make_tuple(invariantDims{}, toReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
ref_dstDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
// used by the BlockWise and MultiBlock method
// (pass-through on dimension 0, pad on dimension 1; the pad amount 2 is a placeholder)
using refType_src2dDesc_padded_34 = decltype(
transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pass_through_transform(ref_invariantLen),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
// Padded form of the 1D destination descriptor (placeholder pad amount 2).
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dst1dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
// Unpadded forms.
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dst1dDesc);
};
using refType_src2dDesc =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
refType_dst1dDesc_padded;
// Reads the 2D source descriptor back from the memory that p_src2dDesc points to.
// need_padding selects which reference type the bytes are interpreted as: the padded type
// when true, the unpadded type otherwise. The descriptor is returned by value.
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
    {
        using StoredDesc = refType_src2dDesc_padded_34;
        return *reinterpret_cast<const StoredDesc*>(p_src2dDesc);
    }
    else
    {
        using StoredDesc = refType_src2dDesc;
        return *reinterpret_cast<const StoredDesc*>(p_src2dDesc);
    }
}
// Reads the 1D destination descriptor back from the memory that p_dst1dDesc points to.
// need_padding selects which reference type the bytes are interpreted as: the padded type
// when true, the unpadded type otherwise. The descriptor is returned by value.
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
    {
        using StoredDesc = refType_dst1dDesc_padded;
        return *reinterpret_cast<const StoredDesc*>(p_dst1dDesc);
    }
    else
    {
        using StoredDesc = refType_dst1dDesc;
        return *reinterpret_cast<const StoredDesc*>(p_dst1dDesc);
    }
}
// First-call kernel of the multi-block reduction (part of the source dimensions reduced).
//
// Loads the two descriptors that the prepare kernel left in the workspace and forwards them,
// together with the source pointer and the workspace buffers, to
// GridwiseReduction_xy_to_x_multiblock::Run.
//
// Parameters:
//   origReduceLen        - forwarded unchanged to Run.
//   BlkGroupSize         - forwarded unchanged to Run.
//   alpha, beta          - forwarded unchanged to Run.
//   p_src_global         - source data, read as srcDataType.
//   p_dst_global         - unused by this kernel.
//   ws_global            - workspace: src2d descriptor at byte 0, dst1d descriptor at byte
//                          2048, first workspace buffer starting at byte 4096.
//   ws_buf2_bytes_offset - byte offset of the second workspace buffer relative to the first;
//                          a value <= 0 means no second buffer (nullptr is passed on).
//   indices_global       - unused by this kernel.
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
                                                     int BlkGroupSize,
                                                     float alpha,
                                                     const void* __restrict__ p_src_global,
                                                     float beta,
                                                     void* __restrict__ p_dst_global,
                                                     void* __restrict__ ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    // Results of this call go to the workspace only.
    (void)indices_global;
    (void)p_dst_global;

    char* const ws_bytes = static_cast<char*>(ws_global);

    const void* p_src2dDesc = ws_bytes;
    const void* p_dst1dDesc = ws_bytes + 2048;
    void* ws_buf1_global    = ws_bytes + 4096;

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_multiblock<BlockSize,
                                                                    srcDataType,
                                                                    dstDataType,
                                                                    compType,
                                                                    decltype(src2dDesc),
                                                                    decltype(dst1dDesc),
                                                                    op,
                                                                    nanPropaOpt,
                                                                    reduceIndicesOpt,
                                                                    GredAccessesPerThreadInBlock>;

    // The second workspace buffer exists only when a positive offset was supplied.
    void* ws_buf2_global = nullptr;
    if(ws_buf2_bytes_offset > 0)
    {
        ws_buf2_global = static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset;
    }

    // Run<2> is selected when indices are needed, Run<1> otherwise.
    constexpr int RunId = need_indices ? 2 : 1;

    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc,
        dst1dDesc,
        origReduceLen,
        BlkGroupSize,
        alpha,
        static_cast<const srcDataType* const __restrict__>(p_src_global),
        beta,
        static_cast<srcDataType* const __restrict__>(ws_buf1_global),
        static_cast<int* const __restrict__>(ws_buf2_global));
}
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_direct_threadwise.hpp"
using namespace ck;
// Build-time configuration of this translation unit. Every value below is derived from a
// CK_PARAM_* preprocessor macro that must be defined when the file is compiled.

// Element types of the source tensor, the destination tensor and the computation, each
// looked up from a DataTypeEnum_t value through get_datatype_from_enum.
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
// Number of dimensions of the source and destination tensors.
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
// Dimension ids to reduce.
using toReduceDims = Sequence<CK_PARAM_TOREDUCE_DIMS>;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
// A macro value of 0 selects NOT_PROPAGATE_NAN; any other value selects PROPAGATE_NAN.
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
// A macro value of 0 selects NO_INDICES; any other value selects FLATTENED_INDICES.
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
// Select whether the padded or the unpadded descriptor type is used for the 2D source view
// and for the 1D destination view.
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
////////////////////////////////////////////////////////////////////////////////////////
// Only toReduceDims is merged in here (the invariant part is an empty Sequence), so the
// check below requires toReduceDims alone to be a valid map over all srcDims dimensions.
using specDims = typename sequence_merge<Sequence<>, toReduceDims>::type;
static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
"Wrong invariant and/or toReduce dimensions!");
// The number of invariant dimensions can be zero if all dimensions are to be reduced;
// this file requires the destination to have exactly one dimension.
static_assert(dstDims == 1,
"If all source dimensions are reduced, the dest should have only one dimension !!");
// Indices are requested only when the operator reports itself as indexable and the
// indices option is not NO_INDICES.
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable
// helper functions using variadic template arguments
// Gathers selected entries of a raw integer array into a tuple.
// For each id carried by the Sequence argument, lengths[id] is converted to index_t; the
// converted values are handed to make_tuple in the order the ids appear in the Sequence.
// The caller must make sure every id is a valid position in the array.
template <index_t... Ids>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ids...>)
{
    return make_tuple(static_cast<index_t>(*(lengths + Ids))...);
}
// Packs the first arraySize entries of a raw integer array into a tuple of index_t.
// arraySize is carried by the Number<> tag argument and must lie in [1, 6]; anything else is
// rejected at compile time.
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(1 <= arraySize && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    // ids 0, 1, ..., arraySize-1
    using LeadingIds = typename arithmetic_sequence_gen<0, arraySize, 1>::type;

    return make_tuple_from_array_and_index_seq(lengths, LeadingIds{});
}
// Turns the compile-time values carried by a Sequence into a tuple holding the same values,
// in the same order, by forwarding them to make_tuple.
template <index_t... Vals>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Vals...>)
{
    return make_tuple(Vals...);
}
// Prepare kernel for the direct thread-wise reduction in which every source dimension is
// reduced.
//
// It builds a 2D view of the source tensor and a 1D view of the destination tensor from the
// lengths/strides passed as scalar arguments, optionally pads them, and stores both
// descriptors into the workspace (source view at byte 0, destination view at byte 2048 of
// ws_global).
//
// Parameters:
//   GridSize            - used together with BlockSize to size the invariant-dimension padding.
//   BlkGroupSize        - unused by this variant.
//   inLength0..5        - source lengths; only the first srcDims entries are read.
//   inStride0..5        - source strides; only the first srcDims entries are read.
//   outLength0..5       - destination lengths; only the first dstDims entries are read.
//   outStride0..5       - destination strides; only the first dstDims entries are read.
//   ws_global           - workspace that receives the two descriptors.
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
int BlkGroupSize,
int inLength0,
int inLength1,
int inLength2,
int inLength3,
int inLength4,
int inLength5,
int inStride0,
int inStride1,
int inStride2,
int inStride3,
int inStride4,
int inStride5,
int outLength0,
int outLength1,
int outLength2,
int outLength3,
int outLength4,
int outLength5,
int outStride0,
int outStride1,
int outStride2,
int outStride3,
int outStride4,
int outStride5,
void* __restrict__ ws_global)
{
(void)BlkGroupSize;
// Fixed byte offsets of the two descriptor slots inside the workspace.
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
// Collect the scalar arguments into arrays so they can be indexed by a compile-time Sequence.
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
const int dstLengths[6] = {
outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
const int dstStrides[6] = {
outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
// Merge all source dimensions into a single dimension ...
const auto one_dim_srcDesc = transform_tensor_descriptor(
srcDesc,
make_tuple(make_merge_transform(tupleSrcLengths)),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
// ... then unmerge it into two dimensions of lengths (1, merged length): dimension 0 is the
// invariant dimension and dimension 1 is the dimension to reduce.
auto src2dDesc = transform_tensor_descriptor(
one_dim_srcDesc,
make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
// Merge all destination dimensions into a single dimension.
auto dst1dDesc = transform_tensor_descriptor(
dstDesc,
make_tuple(make_merge_transform(tupleDstLengths)),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
const auto invariantLen = src2dDesc.GetLength(Number<0>{});
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
constexpr auto copySliceLen = GredThreadBufferLength;
if constexpr(src2d_need_padding)
{
// Dimension 0 is extended to GridSize * BlockSize; dimension 1 is extended to the next
// multiple of copySliceLen. The pad arguments are (0, pad); presumably (left pad, right
// pad) - confirm against make_pad_transform.
const auto srcPad1 = GridSize * BlockSize - invariantLen;
const auto srcPad2 =
((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
make_pad_transform(toReduceLen, 0, srcPad2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
// Only the thread with hipThreadIdx_x == 0 performs the store.
if(hipThreadIdx_x == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(hipThreadIdx_x == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
if constexpr(dst1d_need_padding)
{
// The destination dimension is extended to GridSize * BlockSize.
const auto dstPad = GridSize * BlockSize - invariantLen;
auto dst1dDesc_2 =
transform_tensor_descriptor(dst1dDesc,
make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
if(hipThreadIdx_x == 0)
*static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
}
else
{
if(hipThreadIdx_x == 0)
*static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
}
};
// Compile-time helper that rebuilds, from placeholder lengths, descriptors with the same
// sequence of transforms as the prepare kernel, so that their types can be named.
// The resulting refType_* aliases are what the descriptor readers in this file use to
// reinterpret the workspace contents.
// Note: the toReduceDims parameter is not referenced inside this variant of the struct.
template <index_t srcDims, index_t dstDims, typename toReduceDims>
struct get_ref_desc_types
{
// Placeholder lengths (all 8 for the source, all 1 for the destination).
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 1>::type{};
// Accurate strides are not needed to obtain the expected reference type, so the lengths
// are reused as strides.
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
// Same merge-all-dimensions step as in the prepare kernel.
static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
// Same unmerge into (1, merged length) as in the prepare kernel.
static constexpr auto ref_src2dDesc =
transform_tensor_descriptor(ref_one_dim_srcDesc,
make_tuple(make_unmerge_transform(
make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
ref_dstDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
// used by the DirectThreadWise and DirectWarpWise method
// (pad on both dimensions; the pad amount 2 is a placeholder)
using refType_src2dDesc_padded_12 =
decltype(transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
// Padded form of the 1D destination descriptor (placeholder pad amount 2).
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dst1dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
// Unpadded forms.
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dst1dDesc);
};
// File-scope shorthands for the descriptor types produced by get_ref_desc_types, instantiated
// with this file's srcDims, dstDims and toReduceDims. The descriptor getter functions cast the
// workspace pointers to these types.
using refType_src2dDesc =
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_12 =
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc_padded_12;
using refType_dst1dDesc_padded =
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc_padded;
// Reinterprets the memory at p_src2dDesc as a 2d source descriptor and returns a copy of it.
// need_padding selects the padded descriptor type (true) or the unpadded one (false).
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
    {
        const auto* p_padded = reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc);
        return *p_padded;
    }
    else
    {
        const auto* p_plain = reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc);
        return *p_plain;
    }
};
// Reinterprets the memory at p_dst1dDesc as a 1d destination descriptor and returns a copy of
// it. need_padding selects the padded descriptor type (true) or the unpadded one (false).
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
    {
        const auto* p_padded = reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc);
        return *p_padded;
    }
    else
    {
        const auto* p_plain = reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc);
        return *p_plain;
    }
};
// Reduction kernel entry point using GridwiseReduction_xy_to_x_direct_threadwise.
//
// Loads the 2d source descriptor from byte 0 of ws_global and the 1d destination descriptor
// from byte 2048 of ws_global, then forwards them together with origReduceLen, alpha, beta and
// the typed source/destination/indices pointers to Run<RunId>.
// BlkGroupSize and ws_buf2_bytes_offset are not used by this kernel.
// NOTE(review): the descriptors are presumably stored by a preceding "prepare" kernel launch;
// confirm against the host-side caller.
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
                                                     int BlkGroupSize,
                                                     float alpha,
                                                     const void* __restrict__ p_src_global,
                                                     float beta,
                                                     void* __restrict__ p_dst_global,
                                                     void* __restrict__ ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    // Silence unused-parameter warnings.
    (void)ws_buf2_bytes_offset;
    (void)BlkGroupSize;

    // Byte view of the workspace so the two descriptor slots can be addressed by offset.
    const char* const ws_bytes = static_cast<char*>(ws_global);

    const auto srcDesc2d = get_reduction_src2d_descriptor<src2d_need_padding>(ws_bytes);
    const auto dstDesc1d = get_reduction_dst1d_descriptor<dst1d_need_padding>(ws_bytes + 2048);

    using reducer_t = GridwiseReduction_xy_to_x_direct_threadwise<BlockSize,
                                                                  srcDataType,
                                                                  dstDataType,
                                                                  compType,
                                                                  decltype(srcDesc2d),
                                                                  decltype(dstDesc1d),
                                                                  op,
                                                                  nanPropaOpt,
                                                                  reduceIndicesOpt,
                                                                  true,
                                                                  true,
                                                                  GredThreadBufferLength>;

    // Run variant 2 is used when indices are needed, variant 1 otherwise.
    constexpr int RunId = need_indices ? 2 : 1;

    reducer_t::template Run<RunId>(srcDesc2d,
                                   dstDesc1d,
                                   origReduceLen,
                                   alpha,
                                   static_cast<const srcDataType* const __restrict__>(p_src_global),
                                   beta,
                                   static_cast<dstDataType* const __restrict__>(p_dst_global),
                                   static_cast<const int* const __restrict__>(nullptr),
                                   static_cast<int* const __restrict__>(indices_global));
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_direct_threadwise.hpp"
// Compile-time configuration of this kernel file. The CK_PARAM_* macros are not defined here;
// presumably they are supplied by the build when the kernel is compiled.
using namespace ck;
// Element types of the source tensor, the destination tensor and the reduction computation.
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
// Number of dimensions of the source and destination tensors.
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
// Indices of the source dimensions that are reduced / kept.
using toReduceDims = Sequence<CK_PARAM_TOREDUCE_DIMS>;
using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
// 0 selects NOT_PROPAGATE_NAN; any other value selects PROPAGATE_NAN.
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
// 0 selects NO_INDICES; any other value selects FLATTENED_INDICES.
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
// Select the padded or unpadded descriptor types for the source / destination.
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
////////////////////////////////////////////////////////////////////////////////////////
// invariantDims followed by toReduceDims must form a valid sequence map of size srcDims.
using specDims = typename sequence_merge<invariantDims, toReduceDims>::type;
static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
"Wrong invariant and/or toReduce dimensions!");
// The number of invariant dimensions can be zero if all dimensions are to be reduced
static_assert(invariantDims::Size() > 0 || dstDims == 1,
"If all source dimensions are reduced, the dest should have only one dimension !!");
// Indices are produced only when the operator is indexable and indices were requested.
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable
// Variadic helper: gathers lengths[Ns] for every index Ns of the given Sequence, converts each
// value to index_t and packs them into a tuple in the order of the Sequence.
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(*(lengths + Ns))...);
};
// Packs the first arraySize entries of lengths (indices 0 .. arraySize-1) into a tuple.
// arraySize is restricted to 1..6 at compile time.
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    using leading_indices = typename arithmetic_sequence_gen<0, arraySize, 1>::type;

    return make_tuple_from_array_and_index_seq(lengths, leading_indices{});
};
// Converts a compile-time Sequence<Ns...> into a tuple holding the same values in the same order.
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
return make_tuple(Ns...);
};
// Prepare kernel for the direct thread-wise reduction.
//
// Builds the 2d source descriptor (merged invariant dims x merged to-reduce dims) and the 1d
// destination descriptor (all destination dims merged) from the first srcDims / dstDims of the
// given lengths and strides, and stores them into ws_global: source descriptor at byte 0,
// destination descriptor at byte 2048. Only threads with hipThreadIdx_x == 0 perform the stores.
//
// With src2d_need_padding the source is right-padded so that the invariant length becomes
// GridSize * BlockSize and the to-reduce length is rounded up to a multiple of
// GredThreadBufferLength. With dst1d_need_padding the destination is right-padded by
// GridSize * BlockSize - invariantLen.
// BlkGroupSize is not used by this kernel.
// NOTE(review): nothing here checks that a descriptor fits into its 2048-byte slot.
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
                                                             int BlkGroupSize,
                                                             int inLength0,
                                                             int inLength1,
                                                             int inLength2,
                                                             int inLength3,
                                                             int inLength4,
                                                             int inLength5,
                                                             int inStride0,
                                                             int inStride1,
                                                             int inStride2,
                                                             int inStride3,
                                                             int inStride4,
                                                             int inStride5,
                                                             int outLength0,
                                                             int outLength1,
                                                             int outLength2,
                                                             int outLength3,
                                                             int outLength4,
                                                             int outLength5,
                                                             int outStride0,
                                                             int outStride1,
                                                             int outStride2,
                                                             int outStride3,
                                                             int outStride4,
                                                             int outStride5,
                                                             void* __restrict__ ws_global)
{
    (void)BlkGroupSize;

    // Workspace slots for the two descriptors.
    char* const ws_bytes  = static_cast<char*>(ws_global);
    void* const p_srcSlot = ws_bytes;
    void* const p_dstSlot = ws_bytes + 2048;

    // Every thread builds the descriptors, but only this one stores them.
    const bool is_writer = (hipThreadIdx_x == 0);

    const int inLengths[6]  = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
    const int inStrides[6]  = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
    const int outLengths[6] = {
        outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
    const int outStrides[6] = {
        outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};

    const auto srcLengthTuple = make_tuple_from_array(inLengths, Number<srcDims>{});
    const auto srcStrideTuple = make_tuple_from_array(inStrides, Number<srcDims>{});
    const auto dstLengthTuple = make_tuple_from_array(outLengths, Number<dstDims>{});
    const auto dstStrideTuple = make_tuple_from_array(outStrides, Number<dstDims>{});

    const auto srcNdDesc = make_naive_tensor_descriptor(srcLengthTuple, srcStrideTuple);
    const auto dstNdDesc = make_naive_tensor_descriptor(dstLengthTuple, dstStrideTuple);

    const auto reduceLengthTuple = make_tuple_from_array_and_index_seq(inLengths, toReduceDims{});
    const auto invariantLengthTuple =
        make_tuple_from_array_and_index_seq(inLengths, invariantDims{});

    // 2d view of the source: dimension 0 = invariant dims, dimension 1 = to-reduce dims.
    auto src2dDesc =
        transform_tensor_descriptor(srcNdDesc,
                                    make_tuple(make_merge_transform(invariantLengthTuple),
                                               make_merge_transform(reduceLengthTuple)),
                                    make_tuple(invariantDims{}, toReduceDims{}),
                                    make_tuple(Sequence<0>{}, Sequence<1>{}));

    // 1d view of the destination: all dimensions merged.
    auto dst1dDesc = transform_tensor_descriptor(
        dstNdDesc,
        make_tuple(make_merge_transform(dstLengthTuple)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    const auto invariantLen = src2dDesc.GetLength(Number<0>{});
    const auto toReduceLen  = src2dDesc.GetLength(Number<1>{});

    constexpr auto copySliceLen = GredThreadBufferLength;

    if constexpr(src2d_need_padding)
    {
        const auto invariantPad = GridSize * BlockSize - invariantLen;
        const auto reducePad =
            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

        auto paddedSrc2dDesc =
            transform_tensor_descriptor(src2dDesc,
                                        make_tuple(make_pad_transform(invariantLen, 0, invariantPad),
                                                   make_pad_transform(toReduceLen, 0, reducePad)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(is_writer)
            *static_cast<decltype(paddedSrc2dDesc)*>(p_srcSlot) = paddedSrc2dDesc;
    }
    else
    {
        if(is_writer)
            *static_cast<decltype(src2dDesc)*>(p_srcSlot) = src2dDesc;
    }

    if constexpr(dst1d_need_padding)
    {
        const auto invariantPad = GridSize * BlockSize - invariantLen;

        auto paddedDst1dDesc =
            transform_tensor_descriptor(dst1dDesc,
                                        make_tuple(make_pad_transform(invariantLen, 0, invariantPad)),
                                        make_tuple(Sequence<0>{}),
                                        make_tuple(Sequence<0>{}));

        if(is_writer)
            *static_cast<decltype(paddedDst1dDesc)*>(p_dstSlot) = paddedDst1dDesc;
    }
    else
    {
        if(is_writer)
            *static_cast<decltype(dst1dDesc)*>(p_dstSlot) = dst1dDesc;
    }
};
// Builds compile-time "reference" tensor descriptors whose only purpose is to name the
// descriptor types exported below through decltype. Every length is the placeholder value 8 and
// the lengths are reused as strides. The reference source descriptor mirrors the construction in
// gridwise_generic_reduce_1_prepare: invariantDims merged into dimension 0 and toReduceDims
// merged into dimension 1.
template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
static constexpr auto ref_toReduceDimLengths =
typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
static constexpr auto ref_invariantDimLengths =
typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};
// don't have to use accurate strides to get an expected reference type
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
// 2d source view: dimension 0 = merged invariant dims, dimension 1 = merged to-reduce dims
static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
make_tuple(invariantDims{}, toReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
// 1d destination view: all destination dims merged
static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
ref_dstDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
// used by the DirectThreadWise and DirectWarpWise method
// NOTE(review): the right-pad value 2 looks like a placeholder; presumably only the resulting
// type matters, since the prepare kernel computes the real padding at run time -- confirm.
using refType_src2dDesc_padded_12 =
decltype(transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dst1dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
// unpadded descriptor types
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dst1dDesc);
};
// File-scope shorthands for the descriptor types produced by get_ref_desc_types, instantiated
// with this file's srcDims, dstDims, invariantDims and toReduceDims. The descriptor getter
// functions cast the workspace pointers to these types.
using refType_src2dDesc =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_12 =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
refType_src2dDesc_padded_12;
using refType_dst1dDesc_padded =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
refType_dst1dDesc_padded;
// Returns a copy of the 2d source descriptor stored at p_src2dDesc. The memory is read as the
// padded descriptor type when need_padding is true and as the unpadded type otherwise.
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
    {
        const auto* p_padded = reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc);
        return *p_padded;
    }
    else
    {
        const auto* p_plain = reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc);
        return *p_plain;
    }
};
// Returns a copy of the 1d destination descriptor stored at p_dst1dDesc. The memory is read as
// the padded descriptor type when need_padding is true and as the unpadded type otherwise.
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
    {
        const auto* p_padded = reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc);
        return *p_padded;
    }
    else
    {
        const auto* p_plain = reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc);
        return *p_plain;
    }
};
// Reduction kernel entry point using GridwiseReduction_xy_to_x_direct_threadwise.
//
// Loads the 2d source descriptor from byte 0 of ws_global and the 1d destination descriptor
// from byte 2048 of ws_global (the slots written by gridwise_generic_reduce_1_prepare), then
// forwards them together with origReduceLen, alpha, beta and the typed
// source/destination/indices pointers to Run<RunId>.
// BlkGroupSize and ws_buf2_bytes_offset are not used by this kernel.
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
                                                     int BlkGroupSize,
                                                     float alpha,
                                                     const void* __restrict__ p_src_global,
                                                     float beta,
                                                     void* __restrict__ p_dst_global,
                                                     void* __restrict__ ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    // Silence unused-parameter warnings.
    (void)ws_buf2_bytes_offset;
    (void)BlkGroupSize;

    // Byte view of the workspace so the two descriptor slots can be addressed by offset.
    const char* const ws_bytes = static_cast<char*>(ws_global);

    const auto srcDesc2d = get_reduction_src2d_descriptor<src2d_need_padding>(ws_bytes);
    const auto dstDesc1d = get_reduction_dst1d_descriptor<dst1d_need_padding>(ws_bytes + 2048);

    using reducer_t = GridwiseReduction_xy_to_x_direct_threadwise<BlockSize,
                                                                  srcDataType,
                                                                  dstDataType,
                                                                  compType,
                                                                  decltype(srcDesc2d),
                                                                  decltype(dstDesc1d),
                                                                  op,
                                                                  nanPropaOpt,
                                                                  reduceIndicesOpt,
                                                                  true,
                                                                  true,
                                                                  GredThreadBufferLength>;

    // Run variant 2 is used when indices are needed, variant 1 otherwise.
    constexpr int RunId = need_indices ? 2 : 1;

    reducer_t::template Run<RunId>(srcDesc2d,
                                   dstDesc1d,
                                   origReduceLen,
                                   alpha,
                                   static_cast<const srcDataType* const __restrict__>(p_src_global),
                                   beta,
                                   static_cast<dstDataType* const __restrict__>(p_dst_global),
                                   static_cast<const int* const __restrict__>(nullptr),
                                   static_cast<int* const __restrict__>(indices_global));
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_direct_warpwise.hpp"
// Compile-time configuration of this kernel file (all source dimensions are reduced). The
// CK_PARAM_* macros are not defined here; presumably they are supplied by the build when the
// kernel is compiled.
using namespace ck;
// Element types of the source tensor, the destination tensor and the reduction computation.
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
// Number of dimensions of the source and destination tensors.
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
// Indices of the source dimensions that are reduced.
using toReduceDims = Sequence<CK_PARAM_TOREDUCE_DIMS>;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
// 0 selects NOT_PROPAGATE_NAN; any other value selects PROPAGATE_NAN.
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
// 0 selects NO_INDICES; any other value selects FLATTENED_INDICES.
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
// Select the padded or unpadded descriptor types for the source / destination.
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
////////////////////////////////////////////////////////////////////////////////////////
// There are no invariant dimensions here, so toReduceDims alone must form a valid sequence map
// of size srcDims.
using specDims = typename sequence_merge<Sequence<>, toReduceDims>::type;
static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
"Wrong invariant and/or toReduce dimensions!");
// The number of invariant dimensions can be zero if all dimensions are to be reduced
static_assert(dstDims == 1,
"If all source dimensions are reduced, the dest should have only one dimension !!");
// Indices are produced only when the operator is indexable and indices were requested.
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable
// Variadic helper: gathers lengths[Ns] for every index Ns of the given Sequence, converts each
// value to index_t and packs them into a tuple in the order of the Sequence.
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(*(lengths + Ns))...);
};
// Packs the first arraySize entries of lengths (indices 0 .. arraySize-1) into a tuple.
// arraySize is restricted to 1..6 at compile time.
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    using leading_indices = typename arithmetic_sequence_gen<0, arraySize, 1>::type;

    return make_tuple_from_array_and_index_seq(lengths, leading_indices{});
};
// Converts a compile-time Sequence<Ns...> into a tuple holding the same values in the same order.
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
return make_tuple(Ns...);
};
// Prepare kernel for the direct warp-wise reduction when all source dimensions are reduced.
//
// Builds the 2d source descriptor by merging all source dims into one dimension and unmerging
// it into (1, merged length), and the 1d destination descriptor by merging all destination
// dims. Both are stored into ws_global: source descriptor at byte 0, destination descriptor at
// byte 2048. Only threads with hipThreadIdx_x == 0 perform the stores.
//
// With src2d_need_padding the source is right-padded so that the invariant length becomes
// GridSize * BlockSize / warpSize and the to-reduce length is rounded up to a multiple of
// warpSize * GredAccessesPerThreadInWarp. With dst1d_need_padding the destination is
// right-padded by GridSize * BlockSize / warpSize - invariantLen.
// BlkGroupSize is not used by this kernel.
// NOTE(review): nothing here checks that a descriptor fits into its 2048-byte slot.
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
                                                             int BlkGroupSize,
                                                             int inLength0,
                                                             int inLength1,
                                                             int inLength2,
                                                             int inLength3,
                                                             int inLength4,
                                                             int inLength5,
                                                             int inStride0,
                                                             int inStride1,
                                                             int inStride2,
                                                             int inStride3,
                                                             int inStride4,
                                                             int inStride5,
                                                             int outLength0,
                                                             int outLength1,
                                                             int outLength2,
                                                             int outLength3,
                                                             int outLength4,
                                                             int outLength5,
                                                             int outStride0,
                                                             int outStride1,
                                                             int outStride2,
                                                             int outStride3,
                                                             int outStride4,
                                                             int outStride5,
                                                             void* __restrict__ ws_global)
{
    (void)BlkGroupSize;

    // Workspace slots for the two descriptors.
    char* const ws_bytes  = static_cast<char*>(ws_global);
    void* const p_srcSlot = ws_bytes;
    void* const p_dstSlot = ws_bytes + 2048;

    // Every thread builds the descriptors, but only this one stores them.
    const bool is_writer = (hipThreadIdx_x == 0);

    const int inLengths[6]  = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
    const int inStrides[6]  = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
    const int outLengths[6] = {
        outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
    const int outStrides[6] = {
        outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};

    const auto srcLengthTuple = make_tuple_from_array(inLengths, Number<srcDims>{});
    const auto srcStrideTuple = make_tuple_from_array(inStrides, Number<srcDims>{});
    const auto dstLengthTuple = make_tuple_from_array(outLengths, Number<dstDims>{});
    const auto dstStrideTuple = make_tuple_from_array(outStrides, Number<dstDims>{});

    const auto srcNdDesc = make_naive_tensor_descriptor(srcLengthTuple, srcStrideTuple);
    const auto dstNdDesc = make_naive_tensor_descriptor(dstLengthTuple, dstStrideTuple);

    // All source dimensions merged into one dimension.
    const auto src1dDesc = transform_tensor_descriptor(
        srcNdDesc,
        make_tuple(make_merge_transform(srcLengthTuple)),
        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    // 2d view of the merged source: (1, merged length).
    auto src2dDesc = transform_tensor_descriptor(
        src1dDesc,
        make_tuple(make_unmerge_transform(make_tuple(1, src1dDesc.GetLength(Number<0>{})))),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0, 1>{}));

    // 1d view of the destination: all dimensions merged.
    auto dst1dDesc = transform_tensor_descriptor(
        dstNdDesc,
        make_tuple(make_merge_transform(dstLengthTuple)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    const auto invariantLen = src2dDesc.GetLength(Number<0>{});
    const auto toReduceLen  = src2dDesc.GetLength(Number<1>{});

    constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp;

    if constexpr(src2d_need_padding)
    {
        const auto invariantPad = GridSize * BlockSize / warpSize - invariantLen;
        const auto reducePad =
            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

        auto paddedSrc2dDesc =
            transform_tensor_descriptor(src2dDesc,
                                        make_tuple(make_pad_transform(invariantLen, 0, invariantPad),
                                                   make_pad_transform(toReduceLen, 0, reducePad)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(is_writer)
            *static_cast<decltype(paddedSrc2dDesc)*>(p_srcSlot) = paddedSrc2dDesc;
    }
    else
    {
        if(is_writer)
            *static_cast<decltype(src2dDesc)*>(p_srcSlot) = src2dDesc;
    }

    if constexpr(dst1d_need_padding)
    {
        const auto invariantPad = GridSize * BlockSize / warpSize - invariantLen;

        auto paddedDst1dDesc =
            transform_tensor_descriptor(dst1dDesc,
                                        make_tuple(make_pad_transform(invariantLen, 0, invariantPad)),
                                        make_tuple(Sequence<0>{}),
                                        make_tuple(Sequence<0>{}));

        if(is_writer)
            *static_cast<decltype(paddedDst1dDesc)*>(p_dstSlot) = paddedDst1dDesc;
    }
    else
    {
        if(is_writer)
            *static_cast<decltype(dst1dDesc)*>(p_dstSlot) = dst1dDesc;
    }
};
// Builds compile-time "reference" tensor descriptors whose only purpose is to name the
// descriptor types exported below through decltype. The lengths are placeholders (8 for every
// source dimension, 1 for every destination dimension) and the lengths are reused as strides.
// The reference source descriptor mirrors the construction in gridwise_generic_reduce_1_prepare:
// all source dimensions merged into one dimension, then unmerged into (1, merged length).
// NOTE(review): toReduceDims is not referenced inside this struct.
template <index_t srcDims, index_t dstDims, typename toReduceDims>
struct get_ref_desc_types
{
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 1>::type{};
// don't have to use accurate strides to get an expected reference type
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
// all source dimensions merged into a single dimension
static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
// the single merged dimension viewed as 2d: (1, merged length)
static constexpr auto ref_src2dDesc =
transform_tensor_descriptor(ref_one_dim_srcDesc,
make_tuple(make_unmerge_transform(
make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
// all destination dimensions merged into a single dimension
static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
ref_dstDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
// used by the DirectThreadWise and DirectWarpWise method
// NOTE(review): the right-pad value 2 looks like a placeholder; presumably only the resulting
// type matters, since the prepare kernel computes the real padding at run time -- confirm.
using refType_src2dDesc_padded_12 =
decltype(transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dst1dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
// unpadded descriptor types
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dst1dDesc);
};
// File-scope shorthands for the descriptor types produced by get_ref_desc_types, instantiated
// with this file's srcDims, dstDims and toReduceDims. The descriptor getter functions cast the
// workspace pointers to these types.
using refType_src2dDesc =
    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc;
// An alias declaration requires '=' between the alias name and the aliased type.
using refType_src2dDesc_padded_12 =
    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc_padded_12;
using refType_dst1dDesc_padded =
    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc_padded;
// Returns a copy of the 2d source descriptor stored at p_src2dDesc. The memory is read as the
// padded descriptor type when need_padding is true and as the unpadded type otherwise.
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
    {
        const auto* p_padded = reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc);
        return *p_padded;
    }
    else
    {
        const auto* p_plain = reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc);
        return *p_plain;
    }
};
// Returns a copy of the 1d destination descriptor stored at p_dst1dDesc. The memory is read as
// the padded descriptor type when need_padding is true and as the unpadded type otherwise.
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
    {
        const auto* p_padded = reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc);
        return *p_padded;
    }
    else
    {
        const auto* p_plain = reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc);
        return *p_plain;
    }
};
// Reduction kernel entry point using GridwiseReduction_xy_to_x_direct_warpwise.
//
// Loads the 2d source descriptor from byte 0 of ws_global and the 1d destination descriptor
// from byte 2048 of ws_global (the slots written by gridwise_generic_reduce_1_prepare), then
// forwards them together with origReduceLen, alpha, beta and the typed
// source/destination/indices pointers to Run<RunId>.
// BlkGroupSize and ws_buf2_bytes_offset are not used by this kernel.
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
                                                     int BlkGroupSize,
                                                     float alpha,
                                                     const void* __restrict__ p_src_global,
                                                     float beta,
                                                     void* __restrict__ p_dst_global,
                                                     void* __restrict__ ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    // Silence unused-parameter warnings.
    (void)ws_buf2_bytes_offset;
    (void)BlkGroupSize;

    // Byte view of the workspace so the two descriptor slots can be addressed by offset.
    const char* const ws_bytes = static_cast<char*>(ws_global);

    const auto srcDesc2d = get_reduction_src2d_descriptor<src2d_need_padding>(ws_bytes);
    const auto dstDesc1d = get_reduction_dst1d_descriptor<dst1d_need_padding>(ws_bytes + 2048);

    using reducer_t = GridwiseReduction_xy_to_x_direct_warpwise<BlockSize,
                                                                srcDataType,
                                                                dstDataType,
                                                                compType,
                                                                decltype(srcDesc2d),
                                                                decltype(dstDesc1d),
                                                                op,
                                                                nanPropaOpt,
                                                                reduceIndicesOpt,
                                                                true,
                                                                true,
                                                                GredAccessesPerThreadInWarp>;

    // Run variant 2 is used when indices are needed, variant 1 otherwise.
    constexpr int RunId = need_indices ? 2 : 1;

    reducer_t::template Run<RunId>(srcDesc2d,
                                   dstDesc1d,
                                   origReduceLen,
                                   alpha,
                                   static_cast<const srcDataType* const __restrict__>(p_src_global),
                                   beta,
                                   static_cast<dstDataType* const __restrict__>(p_dst_global),
                                   static_cast<const int* const __restrict__>(nullptr),
                                   static_cast<int* const __restrict__>(indices_global));
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_direct_warpwise.hpp"
// Compile-time configuration of this kernel file. The CK_PARAM_* macros are not defined here;
// presumably they are supplied by the build when the kernel is compiled.
using namespace ck;
// Element types of the source tensor, the destination tensor and the reduction computation.
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
// Number of dimensions of the source and destination tensors.
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
// Indices of the source dimensions that are reduced / kept.
using toReduceDims = Sequence<CK_PARAM_TOREDUCE_DIMS>;
using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
// 0 selects NOT_PROPAGATE_NAN; any other value selects PROPAGATE_NAN.
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
// 0 selects NO_INDICES; any other value selects FLATTENED_INDICES.
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
// Select the padded or unpadded descriptor types for the source / destination.
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
////////////////////////////////////////////////////////////////////////////////////////
// invariantDims followed by toReduceDims must form a valid sequence map of size srcDims.
using specDims = typename sequence_merge<invariantDims, toReduceDims>::type;
static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
"Wrong invariant and/or toReduce dimensions!");
// The number of invariant dimensions can be zero if all dimensions are to be reduced
static_assert(invariantDims::Size() > 0 || dstDims == 1,
"If all source dimensions are reduced, the dest should have only one dimension !!");
// Indices are produced only when the operator is indexable and indices were requested.
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable
// Variadic helper: gathers lengths[Ns] for every index Ns of the given Sequence, converts each
// value to index_t and packs them into a tuple in the order of the Sequence.
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(*(lengths + Ns))...);
};
// Packs the first arraySize entries of lengths (indices 0 .. arraySize-1) into a tuple.
// arraySize is restricted to 1..6 at compile time.
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    using leading_indices = typename arithmetic_sequence_gen<0, arraySize, 1>::type;

    return make_tuple_from_array_and_index_seq(lengths, leading_indices{});
};
// Converts a compile-time Sequence<Ns...> into a tuple holding the same values in the same order.
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
return make_tuple(Ns...);
};
// Preparation kernel for gridwise_generic_reduce_1.
//
// Builds a 2D view of the source tensor (merged invariant dims x merged to-reduce dims) and a
// 1D view of the destination tensor (all dims merged), pads them when the compile-time flags
// src2d_need_padding / dst1d_need_padding are set, and stores both descriptors in the
// workspace: the source descriptor at ws_global, the destination descriptor 2048 bytes later.
//
//   GridSize                     - combined with BlockSize and warpSize to size the padding
//   BlkGroupSize                 - ignored by this kernel
//   inLength0..5, inStride0..5   - source lengths/strides; the first srcDims entries are used
//                                  to build the source descriptor
//   outLength0..5, outStride0..5 - destination lengths/strides; the first dstDims entries are
//                                  used to build the destination descriptor
//   ws_global                    - workspace that receives the two descriptors
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
                                                             int BlkGroupSize,
                                                             int inLength0,
                                                             int inLength1,
                                                             int inLength2,
                                                             int inLength3,
                                                             int inLength4,
                                                             int inLength5,
                                                             int inStride0,
                                                             int inStride1,
                                                             int inStride2,
                                                             int inStride3,
                                                             int inStride4,
                                                             int inStride5,
                                                             int outLength0,
                                                             int outLength1,
                                                             int outLength2,
                                                             int outLength3,
                                                             int outLength4,
                                                             int outLength5,
                                                             int outStride0,
                                                             int outStride1,
                                                             int outStride2,
                                                             int outStride3,
                                                             int outStride4,
                                                             int outStride5,
                                                             void* __restrict__ ws_global)
{
    (void)BlkGroupSize;

    // Workspace layout: source descriptor first, destination descriptor at a fixed byte offset.
    constexpr int dst1dDescByteOffset = 2048;

    void* const p_src2dDesc = ws_global;
    void* const p_dst1dDesc = static_cast<char*>(ws_global) + dst1dDescByteOffset;

    // Only the thread with hipThreadIdx_x == 0 stores the descriptors.
    const bool isWriterThread = (hipThreadIdx_x == 0);

    // Gather the scalar kernel arguments into arrays so they can be indexed by dimension.
    const int inLengths[6]  = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
    const int inStrides[6]  = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
    const int outLengths[6] = {
        outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
    const int outStrides[6] = {
        outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};

    // Lengths of the dimension groups that get merged into the two axes of the 2D source view.
    const auto invariantDimLens = make_tuple_from_array_and_index_seq(inLengths, invariantDims{});
    const auto toReduceDimLens  = make_tuple_from_array_and_index_seq(inLengths, toReduceDims{});

    // Naive (lengths + strides) descriptors of the original tensors.
    const auto srcLengthsTuple = make_tuple_from_array(inLengths, Number<srcDims>{});
    const auto srcStridesTuple = make_tuple_from_array(inStrides, Number<srcDims>{});
    const auto dstLengthsTuple = make_tuple_from_array(outLengths, Number<dstDims>{});
    const auto dstStridesTuple = make_tuple_from_array(outStrides, Number<dstDims>{});

    const auto srcDesc = make_naive_tensor_descriptor(srcLengthsTuple, srcStridesTuple);
    const auto dstDesc = make_naive_tensor_descriptor(dstLengthsTuple, dstStridesTuple);

    // Source as 2D: axis 0 = merged invariant dims, axis 1 = merged to-reduce dims.
    auto src2dDesc =
        transform_tensor_descriptor(srcDesc,
                                    make_tuple(make_merge_transform(invariantDimLens),
                                               make_merge_transform(toReduceDimLens)),
                                    make_tuple(invariantDims{}, toReduceDims{}),
                                    make_tuple(Sequence<0>{}, Sequence<1>{}));

    // Destination as 1D: all of its dimensions merged into a single axis.
    auto dst1dDesc = transform_tensor_descriptor(
        dstDesc,
        make_tuple(make_merge_transform(dstLengthsTuple)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    const auto invariantLen = src2dDesc.GetLength(Number<0>{});
    const auto toReduceLen  = src2dDesc.GetLength(Number<1>{});

    constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp;

    if constexpr(src2d_need_padding)
    {
        // axis 0 is extended to GridSize * BlockSize / warpSize entries
        const auto invariantPad = GridSize * BlockSize / warpSize - invariantLen;
        // axis 1 is extended to the next multiple of copySliceLen
        const auto toReducePad =
            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

        auto paddedSrc2dDesc =
            transform_tensor_descriptor(src2dDesc,
                                        make_tuple(make_pad_transform(invariantLen, 0, invariantPad),
                                                   make_pad_transform(toReduceLen, 0, toReducePad)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(isWriterThread)
        {
            *static_cast<decltype(paddedSrc2dDesc)*>(p_src2dDesc) = paddedSrc2dDesc;
        }
    }
    else
    {
        if(isWriterThread)
        {
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
        }
    }

    if constexpr(dst1d_need_padding)
    {
        // the destination axis is extended to GridSize * BlockSize / warpSize entries
        const auto dstPad = GridSize * BlockSize / warpSize - invariantLen;

        auto paddedDst1dDesc =
            transform_tensor_descriptor(dst1dDesc,
                                        make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
                                        make_tuple(Sequence<0>{}),
                                        make_tuple(Sequence<0>{}));

        if(isWriterThread)
        {
            *static_cast<decltype(paddedDst1dDesc)*>(p_dst1dDesc) = paddedDst1dDesc;
        }
    }
    else
    {
        if(isWriterThread)
        {
            *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
        }
    }
}
// Compile-time helper that derives the descriptor *types* for a given dimension configuration.
// It repeats the descriptor construction steps of the prepare kernel using placeholder values,
// and exposes the resulting types through the refType_* aliases at the bottom of the struct.
//   srcDims       - number of source tensor dimensions
//   dstDims       - number of destination tensor dimensions
//   invariantDims - sequence type; its Size() and its use as a dimension-id list are relied on
//   toReduceDims  - sequence type; its Size() and its use as a dimension-id list are relied on
// NOTE(review): the placeholder values (8 for lengths, 2 for pads) presumably do not influence
// the resulting types, only the constexpr values - confirm against the descriptor helpers.
template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
// placeholder length sequences generated by uniform_sequence_gen<count, 8>
// (presumably `count` entries that are all 8 - TODO confirm)
static constexpr auto ref_toReduceDimLengths =
typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
static constexpr auto ref_invariantDimLengths =
typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};
// accurate strides are not needed to obtain the expected reference type,
// so the placeholder lengths are passed as the strides as well
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
// 2D source view: axis 0 merges the invariant dims, axis 1 merges the to-reduce dims
static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
make_tuple(invariantDims{}, toReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
// 1D destination view: all destination dims merged into a single axis
static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
ref_dstDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
// lengths of the two axes of the 2D view, fed to the pad transforms below
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
// used by the DirectThreadWise and DirectWarpWise method
// type of the 2D source descriptor with a pad transform applied to both axes
using refType_src2dDesc_padded_12 =
decltype(transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
// type of the 1D destination descriptor with a pad transform applied to its single axis
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dst1dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
// types of the unpadded descriptors (decltype of a static constexpr member, hence const)
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dst1dDesc);
};
using refType_src2dDesc =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_12 =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
refType_src2dDesc_padded_12;
using refType_dst1dDesc_padded =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
refType_dst1dDesc_padded;
// Returns a copy of the 2D source descriptor stored at `p_src2dDesc`.
//   need_padding - selects the type the stored bytes are read as:
//                  refType_src2dDesc_padded_12 when true, refType_src2dDesc otherwise
//   p_src2dDesc  - address of the stored descriptor; it must hold an object of the selected type
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
    {
        using StoredDesc = refType_src2dDesc_padded_12;
        return *static_cast<const StoredDesc*>(p_src2dDesc);
    }
    else
    {
        using StoredDesc = refType_src2dDesc;
        return *static_cast<const StoredDesc*>(p_src2dDesc);
    }
}
// Returns a copy of the 1D destination descriptor stored at `p_dst1dDesc`.
//   need_padding - selects the type the stored bytes are read as:
//                  refType_dst1dDesc_padded when true, refType_dst1dDesc otherwise
//   p_dst1dDesc  - address of the stored descriptor; it must hold an object of the selected type
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
    {
        using StoredDesc = refType_dst1dDesc_padded;
        return *static_cast<const StoredDesc*>(p_dst1dDesc);
    }
    else
    {
        using StoredDesc = refType_dst1dDesc;
        return *static_cast<const StoredDesc*>(p_dst1dDesc);
    }
}
// Reduction kernel: loads the two tensor descriptors from the workspace and runs
// GridwiseReduction_xy_to_x_direct_warpwise on the source/destination buffers.
//
// The descriptors are read from ws_global (2D source) and ws_global + 2048 bytes (1D
// destination); they are expected to have been stored there by
// gridwise_generic_reduce_1_prepare.
//
//   origReduceLen        - forwarded unchanged to Run
//   BlkGroupSize         - ignored by this kernel
//   alpha, beta          - forwarded unchanged to Run
//   p_src_global         - source buffer, read as srcDataType
//   p_dst_global         - destination buffer, accessed as dstDataType
//   ws_global            - workspace holding the descriptors
//   ws_buf2_bytes_offset - ignored by this kernel
//   indices_global       - indices buffer, forwarded to Run as int*
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
                                                     int BlkGroupSize,
                                                     float alpha,
                                                     const void* __restrict__ p_src_global,
                                                     float beta,
                                                     void* __restrict__ p_dst_global,
                                                     void* __restrict__ ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    (void)BlkGroupSize;
    (void)ws_buf2_bytes_offset;

    // Same workspace layout as used when the descriptors were stored.
    constexpr int dst1dDescByteOffset = 2048;

    const void* const srcDescAddr = ws_global;
    const void* const dstDescAddr = static_cast<char*>(ws_global) + dst1dDescByteOffset;

    // Kept as `const auto` so that decltype() below yields the const-qualified descriptor types.
    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(srcDescAddr);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(dstDescAddr);

    // NOTE(review): the two `true` arguments are presumably first-call / last-call flags -
    // confirm against the declaration of GridwiseReduction_xy_to_x_direct_warpwise.
    using WarpwiseReduction =
        GridwiseReduction_xy_to_x_direct_warpwise<BlockSize,
                                                  srcDataType,
                                                  dstDataType,
                                                  compType,
                                                  decltype(src2dDesc),
                                                  decltype(dst1dDesc),
                                                  op,
                                                  nanPropaOpt,
                                                  reduceIndicesOpt,
                                                  true,
                                                  true,
                                                  GredAccessesPerThreadInWarp>;

    // Run<2> is selected when need_indices is set, Run<1> otherwise.
    constexpr int RunId = need_indices ? 2 : 1;

    const auto* const __restrict__ srcData =
        static_cast<const srcDataType* const __restrict__>(p_src_global);
    auto* const __restrict__ dstData = static_cast<dstDataType* const __restrict__>(p_dst_global);

    WarpwiseReduction::template Run<RunId>(
        src2dDesc,
        dst1dDesc,
        origReduceLen,
        alpha,
        srcData,
        beta,
        dstData,
        static_cast<const int* const __restrict__>(nullptr), // no input indices buffer is passed
        static_cast<int* const __restrict__>(indices_global));
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment