Commit f8a481f8 authored by zhouxiang

Add the cub header files from dtk

parent 7b7c64c5
/opt/dtk-23.04/cuda/include/cub
\ No newline at end of file
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_ADJACENT_DIFFERENCE_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_ADJACENT_DIFFERENCE_HPP_
#include "../config.hpp"
#include <cub/rocprim/block/block_adjacent_difference.hpp>
BEGIN_HIPCUB_NAMESPACE
namespace detail
{
// Trait checks if FlagOp can be called with 3 arguments (a, b, b_index)
template<class T, class FlagOp, class = void>
struct WithBIndexArg
: std::false_type
{ };
template<class T, class FlagOp>
struct WithBIndexArg<
T, FlagOp,
typename std::conditional<
true,
void,
decltype(std::declval<FlagOp>()(std::declval<T>(), std::declval<T>(), 0))
>::type
> : std::true_type
{ };
}
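// A minimal illustration (not part of the original header) of what this
// trait detects, using hypothetical functor types:
//
//   struct TwoArgOp { HIPCUB_DEVICE bool operator()(int a, int b) const; };
//   struct ThreeArgOp { HIPCUB_DEVICE bool operator()(int a, int b, unsigned b_index) const; };
//
//   static_assert(!detail::WithBIndexArg<int, TwoArgOp>::value, "");
//   static_assert(detail::WithBIndexArg<int, ThreeArgOp>::value, "");
//
// The std::conditional<true, void, decltype(...)> construct is a void_t-style
// SFINAE probe: the partial specialization above is viable only when FlagOp
// is callable with (T, T, int).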
template<
typename T,
int BLOCK_DIM_X,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH /* ignored */
>
class BlockAdjacentDifference
: private ::rocprim::block_adjacent_difference<
T,
BLOCK_DIM_X,
BLOCK_DIM_Y,
BLOCK_DIM_Z
>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
using base_type =
typename ::rocprim::block_adjacent_difference<
T,
BLOCK_DIM_X,
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
using TempStorage = typename base_type::storage_type;
HIPCUB_DEVICE inline
BlockAdjacentDifference() : temp_storage_(private_storage())
{
}
HIPCUB_DEVICE inline
BlockAdjacentDifference(TempStorage& temp_storage) : temp_storage_(temp_storage)
{
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
[[deprecated("The Flags API of BlockAdjacentDifference is deprecated.")]]
HIPCUB_DEVICE inline
void FlagHeads(FlagT (&head_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
HIPCUB_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wdeprecated")
base_type::flag_heads(head_flags, input, flag_op, temp_storage_);
HIPCUB_CLANG_SUPPRESS_WARNING_POP
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
[[deprecated("The Flags API of BlockAdjacentDifference is deprecated.")]]
HIPCUB_DEVICE inline
void FlagHeads(FlagT (&head_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op,
T tile_predecessor_item)
{
HIPCUB_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wdeprecated")
base_type::flag_heads(head_flags, tile_predecessor_item, input, flag_op, temp_storage_);
HIPCUB_CLANG_SUPPRESS_WARNING_POP
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
[[deprecated("The Flags API of BlockAdjacentDifference is deprecated.")]]
HIPCUB_DEVICE inline
void FlagTails(FlagT (&tail_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
HIPCUB_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wdeprecated")
base_type::flag_tails(tail_flags, input, flag_op, temp_storage_);
HIPCUB_CLANG_SUPPRESS_WARNING_POP
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
[[deprecated("The Flags API of BlockAdjacentDifference is deprecated.")]]
HIPCUB_DEVICE inline
void FlagTails(FlagT (&tail_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op,
T tile_successor_item)
{
HIPCUB_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wdeprecated")
base_type::flag_tails(tail_flags, tile_successor_item, input, flag_op, temp_storage_);
HIPCUB_CLANG_SUPPRESS_WARNING_POP
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
[[deprecated("The Flags API of BlockAdjacentDifference is deprecated.")]]
HIPCUB_DEVICE inline
void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD],
FlagT (&tail_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
HIPCUB_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wdeprecated")
base_type::flag_heads_and_tails(
head_flags, tail_flags, input,
flag_op, temp_storage_
);
HIPCUB_CLANG_SUPPRESS_WARNING_POP
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
[[deprecated("The Flags API of BlockAdjacentDifference is deprecated.")]]
HIPCUB_DEVICE inline
void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD],
FlagT (&tail_flags)[ITEMS_PER_THREAD],
T tile_successor_item,
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
HIPCUB_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wdeprecated")
base_type::flag_heads_and_tails(
head_flags, tail_flags, tile_successor_item, input,
flag_op, temp_storage_
);
HIPCUB_CLANG_SUPPRESS_WARNING_POP
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
[[deprecated("The Flags API of BlockAdjacentDifference is deprecated.")]]
HIPCUB_DEVICE inline
void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD],
T tile_predecessor_item,
FlagT (&tail_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
HIPCUB_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wdeprecated")
base_type::flag_heads_and_tails(
head_flags, tile_predecessor_item, tail_flags, input,
flag_op, temp_storage_
);
HIPCUB_CLANG_SUPPRESS_WARNING_POP
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
[[deprecated("The Flags API of BlockAdjacentDifference is deprecated.")]]
HIPCUB_DEVICE inline
void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD],
T tile_predecessor_item,
FlagT (&tail_flags)[ITEMS_PER_THREAD],
T tile_successor_item,
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
HIPCUB_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wdeprecated")
base_type::flag_heads_and_tails(
head_flags, tile_predecessor_item, tail_flags, tile_successor_item, input,
flag_op, temp_storage_
);
HIPCUB_CLANG_SUPPRESS_WARNING_POP
}
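/**
 * A usage sketch (illustrative only; CustomDifference and the load step are
 * assumptions, not part of this header). SubtractLeft computes, per item,
 * difference_op(item, predecessor), where the predecessor of a thread's
 * first item is the last item of the previous thread; in this overload the
 * very first item of the block is passed through unchanged.
 * @code
 * struct CustomDifference
 * {
 *     template <typename DataType>
 *     HIPCUB_DEVICE DataType operator()(DataType lhs, DataType rhs)
 *     {
 *         return lhs - rhs;
 *     }
 * };
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Specialize for a 1D block of 128 threads owning 4 integer items each
 *     using BlockAdjacentDifferenceT = hipcub::BlockAdjacentDifference<int, 128>;
 *     __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
 *
 *     int thread_data[4];
 *     // ... load thread_data ...
 *     BlockAdjacentDifferenceT(temp_storage)
 *         .SubtractLeft(thread_data, thread_data, CustomDifference());
 * }
 * @endcode
 */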
template <int ITEMS_PER_THREAD, typename OutputType, typename DifferenceOpT>
HIPCUB_DEVICE inline
void SubtractLeft(T (&input)[ITEMS_PER_THREAD],
OutputType (&output)[ITEMS_PER_THREAD],
DifferenceOpT difference_op)
{
base_type::subtract_left(
input, output, difference_op, temp_storage_
);
}
template <int ITEMS_PER_THREAD, typename OutputT, typename DifferenceOpT>
HIPCUB_DEVICE inline
void SubtractLeft(T (&input)[ITEMS_PER_THREAD],
OutputT (&output)[ITEMS_PER_THREAD],
DifferenceOpT difference_op,
T tile_predecessor_item)
{
base_type::subtract_left(
input, output, difference_op, tile_predecessor_item, temp_storage_
);
}
template <int ITEMS_PER_THREAD, typename OutputType, typename DifferenceOpT>
HIPCUB_DEVICE inline
void SubtractLeftPartialTile(T (&input)[ITEMS_PER_THREAD],
OutputType (&output)[ITEMS_PER_THREAD],
DifferenceOpT difference_op,
int valid_items)
{
base_type::subtract_left_partial(
input, output, difference_op, valid_items, temp_storage_
);
}
template <int ITEMS_PER_THREAD, typename OutputT, typename DifferenceOpT>
HIPCUB_DEVICE inline
void SubtractRight(T (&input)[ITEMS_PER_THREAD],
OutputT (&output)[ITEMS_PER_THREAD],
DifferenceOpT difference_op)
{
base_type::subtract_right(
input, output, difference_op, temp_storage_
);
}
template <int ITEMS_PER_THREAD, typename OutputT, typename DifferenceOpT>
HIPCUB_DEVICE inline
void SubtractRight(T (&input)[ITEMS_PER_THREAD],
OutputT (&output)[ITEMS_PER_THREAD],
DifferenceOpT difference_op,
T tile_successor_item)
{
base_type::subtract_right(
input, output, difference_op, tile_successor_item, temp_storage_
);
}
template <int ITEMS_PER_THREAD, typename OutputT, typename DifferenceOpT>
HIPCUB_DEVICE inline
void SubtractRightPartialTile(T (&input)[ITEMS_PER_THREAD],
OutputT (&output)[ITEMS_PER_THREAD],
DifferenceOpT difference_op,
int valid_items)
{
base_type::subtract_right_partial(
input, output, difference_op, valid_items, temp_storage_
);
}
private:
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_ADJACENT_DIFFERENCE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_DISCONTINUITY_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_DISCONTINUITY_HPP_
#include "../config.hpp"
#include <cub/rocprim/block/block_discontinuity.hpp>
BEGIN_HIPCUB_NAMESPACE
template<
typename T,
int BLOCK_DIM_X,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH /* ignored */
>
class BlockDiscontinuity
: private ::rocprim::block_discontinuity<
T,
BLOCK_DIM_X,
BLOCK_DIM_Y,
BLOCK_DIM_Z
>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
using base_type =
typename ::rocprim::block_discontinuity<
T,
BLOCK_DIM_X,
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
using TempStorage = typename base_type::storage_type;
HIPCUB_DEVICE inline
BlockDiscontinuity() : temp_storage_(private_storage())
{
}
HIPCUB_DEVICE inline
BlockDiscontinuity(TempStorage& temp_storage) : temp_storage_(temp_storage)
{
}
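/**
 * A usage sketch (illustrative only; the load step is an assumption).
 * FlagHeads sets head_flags[i] to 1 wherever flag_op reports a
 * discontinuity between an item and its predecessor; in this overload the
 * first item of thread 0 is always flagged as a head.
 * @code
 * using BlockDiscontinuityT = hipcub::BlockDiscontinuity<int, 128>;
 * __shared__ typename BlockDiscontinuityT::TempStorage temp_storage;
 *
 * int thread_data[4];
 * // ... load thread_data ...
 * int head_flags[4];
 * BlockDiscontinuityT(temp_storage).FlagHeads(head_flags, thread_data,
 *                                             hipcub::Inequality());
 * @endcode
 */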
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
HIPCUB_DEVICE inline
void FlagHeads(FlagT (&head_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
base_type::flag_heads(head_flags, input, flag_op, temp_storage_);
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
HIPCUB_DEVICE inline
void FlagHeads(FlagT (&head_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op,
T tile_predecessor_item)
{
base_type::flag_heads(head_flags, tile_predecessor_item, input, flag_op, temp_storage_);
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
HIPCUB_DEVICE inline
void FlagTails(FlagT (&tail_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
base_type::flag_tails(tail_flags, input, flag_op, temp_storage_);
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
HIPCUB_DEVICE inline
void FlagTails(FlagT (&tail_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op,
T tile_successor_item)
{
base_type::flag_tails(tail_flags, tile_successor_item, input, flag_op, temp_storage_);
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
HIPCUB_DEVICE inline
void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD],
FlagT (&tail_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
base_type::flag_heads_and_tails(
head_flags, tail_flags, input,
flag_op, temp_storage_
);
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
HIPCUB_DEVICE inline
void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD],
FlagT (&tail_flags)[ITEMS_PER_THREAD],
T tile_successor_item,
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
base_type::flag_heads_and_tails(
head_flags, tail_flags, tile_successor_item, input,
flag_op, temp_storage_
);
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
HIPCUB_DEVICE inline
void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD],
T tile_predecessor_item,
FlagT (&tail_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
base_type::flag_heads_and_tails(
head_flags, tile_predecessor_item, tail_flags, input,
flag_op, temp_storage_
);
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
HIPCUB_DEVICE inline
void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD],
T tile_predecessor_item,
FlagT (&tail_flags)[ITEMS_PER_THREAD],
T tile_successor_item,
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
base_type::flag_heads_and_tails(
head_flags, tile_predecessor_item, tail_flags, tile_successor_item, input,
flag_op, temp_storage_
);
}
private:
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_DISCONTINUITY_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_
#include "../config.hpp"
#include <cub/rocprim/block/block_exchange.hpp>
BEGIN_HIPCUB_NAMESPACE
template<
typename InputT,
int BLOCK_DIM_X,
int ITEMS_PER_THREAD,
bool WARP_TIME_SLICING = false, /* ignored */
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH /* ignored */
>
class BlockExchange
: private ::rocprim::block_exchange<
InputT,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
BLOCK_DIM_Y,
BLOCK_DIM_Z
>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
using base_type =
typename ::rocprim::block_exchange<
InputT,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
using TempStorage = typename base_type::storage_type;
HIPCUB_DEVICE inline
BlockExchange() : temp_storage_(private_storage())
{
}
HIPCUB_DEVICE inline
BlockExchange(TempStorage& temp_storage) : temp_storage_(temp_storage)
{
}
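/**
 * A usage sketch (illustrative only). With 128 threads and 4 items each,
 * a striped arrangement places item i of thread t at tile rank i * 128 + t,
 * while a blocked arrangement places it at rank t * 4 + i;
 * StripedToBlocked converts the former into the latter.
 * @code
 * using BlockExchangeT = hipcub::BlockExchange<int, 128, 4>;
 * __shared__ typename BlockExchangeT::TempStorage temp_storage;
 *
 * int thread_data[4];
 * // ... load a striped arrangement, e.g. with LoadDirectStriped<128> ...
 * BlockExchangeT(temp_storage).StripedToBlocked(thread_data, thread_data);
 * @endcode
 */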
template<typename OutputT>
HIPCUB_DEVICE inline
void StripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD])
{
base_type::striped_to_blocked(input_items, output_items, temp_storage_);
}
template<typename OutputT>
HIPCUB_DEVICE inline
void BlockedToStriped(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD])
{
base_type::blocked_to_striped(input_items, output_items, temp_storage_);
}
template<typename OutputT>
HIPCUB_DEVICE inline
void WarpStripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD])
{
base_type::warp_striped_to_blocked(input_items, output_items, temp_storage_);
}
template<typename OutputT>
HIPCUB_DEVICE inline
void BlockedToWarpStriped(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD])
{
base_type::blocked_to_warp_striped(input_items, output_items, temp_storage_);
}
template<typename OutputT, typename OffsetT>
HIPCUB_DEVICE inline
void ScatterToBlocked(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
OffsetT (&ranks)[ITEMS_PER_THREAD])
{
base_type::scatter_to_blocked(input_items, output_items, ranks, temp_storage_);
}
template<typename OutputT, typename OffsetT>
HIPCUB_DEVICE inline
void ScatterToStriped(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
OffsetT (&ranks)[ITEMS_PER_THREAD])
{
base_type::scatter_to_striped(input_items, output_items, ranks, temp_storage_);
}
template<typename OutputT, typename OffsetT>
HIPCUB_DEVICE inline
void ScatterToStripedGuarded(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
OffsetT (&ranks)[ITEMS_PER_THREAD])
{
base_type::scatter_to_striped_guarded(input_items, output_items, ranks, temp_storage_);
}
template<typename OutputT, typename OffsetT, typename ValidFlag>
HIPCUB_DEVICE inline
void ScatterToStripedFlagged(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
OffsetT (&ranks)[ITEMS_PER_THREAD],
ValidFlag (&is_valid)[ITEMS_PER_THREAD])
{
base_type::scatter_to_striped_flagged(input_items, output_items, ranks, is_valid, temp_storage_);
}
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
HIPCUB_DEVICE inline void StripedToBlocked(
InputT (&items)[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
{
StripedToBlocked(items, items);
}
HIPCUB_DEVICE inline void BlockedToStriped(
InputT (&items)[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
{
BlockedToStriped(items, items);
}
HIPCUB_DEVICE inline void WarpStripedToBlocked(
InputT (&items)[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
{
WarpStripedToBlocked(items, items);
}
HIPCUB_DEVICE inline void BlockedToWarpStriped(
InputT (&items)[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
{
BlockedToWarpStriped(items, items);
}
template <typename OffsetT>
HIPCUB_DEVICE inline void ScatterToBlocked(
InputT (&items)[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
OffsetT (&ranks)[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
{
ScatterToBlocked(items, items, ranks);
}
template <typename OffsetT>
HIPCUB_DEVICE inline void ScatterToStriped(
InputT (&items)[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
OffsetT (&ranks)[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
{
ScatterToStriped(items, items, ranks);
}
template <typename OffsetT>
HIPCUB_DEVICE inline void ScatterToStripedGuarded(
InputT (&items)[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
OffsetT (&ranks)[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
{
ScatterToStripedGuarded(items, items, ranks);
}
template <typename OffsetT, typename ValidFlag>
HIPCUB_DEVICE inline void ScatterToStripedFlagged(
InputT (&items)[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
OffsetT (&ranks)[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
ValidFlag (&is_valid)[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity
{
ScatterToStripedFlagged(items, items, ranks, is_valid);
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
private:
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_HISTOGRAM_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_HISTOGRAM_HPP_
#include <type_traits>
#include "../config.hpp"
#include <cub/rocprim/block/block_histogram.hpp>
BEGIN_HIPCUB_NAMESPACE
namespace detail
{
inline constexpr
typename std::underlying_type<::rocprim::block_histogram_algorithm>::type
to_BlockHistogramAlgorithm_enum(::rocprim::block_histogram_algorithm v)
{
using utype = std::underlying_type<::rocprim::block_histogram_algorithm>::type;
return static_cast<utype>(v);
}
}
enum BlockHistogramAlgorithm
{
BLOCK_HISTO_ATOMIC
= detail::to_BlockHistogramAlgorithm_enum(::rocprim::block_histogram_algorithm::using_atomic),
BLOCK_HISTO_SORT
= detail::to_BlockHistogramAlgorithm_enum(::rocprim::block_histogram_algorithm::using_sort)
};
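// BLOCK_HISTO_ATOMIC composites items into the bin counters with atomic
// additions, while BLOCK_HISTO_SORT sorts the items and then counts run
// lengths; which of the two is faster generally depends on the number of
// bins and on how heavily the input collides on the same bins.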
template<
typename T,
int BLOCK_DIM_X,
int ITEMS_PER_THREAD,
int BINS,
BlockHistogramAlgorithm ALGORITHM = BLOCK_HISTO_SORT,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH /* ignored */
>
class BlockHistogram
: private ::rocprim::block_histogram<
T,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
BINS,
static_cast<::rocprim::block_histogram_algorithm>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
using base_type =
typename ::rocprim::block_histogram<
T,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
BINS,
static_cast<::rocprim::block_histogram_algorithm>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
using TempStorage = typename base_type::storage_type;
HIPCUB_DEVICE inline
BlockHistogram() : temp_storage_(private_storage())
{
}
HIPCUB_DEVICE inline
BlockHistogram(TempStorage& temp_storage) : temp_storage_(temp_storage)
{
}
template<class CounterT>
HIPCUB_DEVICE inline
void InitHistogram(CounterT histogram[BINS])
{
base_type::init_histogram(histogram);
}
template<class CounterT>
HIPCUB_DEVICE inline
void Composite(T (&items)[ITEMS_PER_THREAD],
CounterT histogram[BINS])
{
base_type::composite(items, histogram, temp_storage_);
}
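/**
 * A usage sketch (illustrative only; the sample-loading step is an
 * assumption). Histogram() is InitHistogram() followed by a block-wide
 * synchronization and Composite(), i.e. it both zeroes and populates the
 * bin counters.
 * @code
 * // 128 threads, 4 samples per thread, 256 bins
 * using BlockHistogramT = hipcub::BlockHistogram<unsigned char, 128, 4, 256>;
 * __shared__ typename BlockHistogramT::TempStorage temp_storage;
 * __shared__ unsigned int smem_histogram[256];
 *
 * unsigned char thread_samples[4];
 * // ... load thread_samples ...
 * BlockHistogramT(temp_storage).Histogram(thread_samples, smem_histogram);
 * @endcode
 */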
template<class CounterT>
HIPCUB_DEVICE inline
void Histogram(T (&items)[ITEMS_PER_THREAD],
CounterT histogram[BINS])
{
base_type::init_histogram(histogram);
CTA_SYNC();
base_type::composite(items, histogram, temp_storage_);
}
private:
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_HISTOGRAM_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_LOAD_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_LOAD_HPP_
#include <type_traits>
#include "../config.hpp"
#include <cub/rocprim/block/block_load.hpp>
#include "block_load_func.cuh"
BEGIN_HIPCUB_NAMESPACE
namespace detail
{
inline constexpr
typename std::underlying_type<::rocprim::block_load_method>::type
to_BlockLoadAlgorithm_enum(::rocprim::block_load_method v)
{
using utype = std::underlying_type<::rocprim::block_load_method>::type;
return static_cast<utype>(v);
}
}
enum BlockLoadAlgorithm
{
BLOCK_LOAD_DIRECT
= detail::to_BlockLoadAlgorithm_enum(::rocprim::block_load_method::block_load_direct),
BLOCK_LOAD_STRIPED
= detail::to_BlockLoadAlgorithm_enum(::rocprim::block_load_method::block_load_striped),
BLOCK_LOAD_VECTORIZE
= detail::to_BlockLoadAlgorithm_enum(::rocprim::block_load_method::block_load_vectorize),
BLOCK_LOAD_TRANSPOSE
= detail::to_BlockLoadAlgorithm_enum(::rocprim::block_load_method::block_load_transpose),
BLOCK_LOAD_WARP_TRANSPOSE
= detail::to_BlockLoadAlgorithm_enum(::rocprim::block_load_method::block_load_warp_transpose),
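// rocPRIM has no time-sliced warp-transpose variant, so the time-sliced
// algorithm below maps onto the same method as BLOCK_LOAD_WARP_TRANSPOSE.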
BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED
= detail::to_BlockLoadAlgorithm_enum(::rocprim::block_load_method::block_load_warp_transpose)
};
template<
typename T,
int BLOCK_DIM_X,
int ITEMS_PER_THREAD,
BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH /* ignored */
>
class BlockLoad
: private ::rocprim::block_load<
T,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
static_cast<::rocprim::block_load_method>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
using base_type =
typename ::rocprim::block_load<
T,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
static_cast<::rocprim::block_load_method>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
using TempStorage = typename base_type::storage_type;
HIPCUB_DEVICE inline
BlockLoad() : temp_storage_(private_storage())
{
}
HIPCUB_DEVICE inline
BlockLoad(TempStorage& temp_storage) : temp_storage_(temp_storage)
{
}
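/**
 * A usage sketch (illustrative only; d_in and the grid mapping are
 * assumptions). Load() reads one tile of BLOCK_THREADS * ITEMS_PER_THREAD
 * items starting at block_iter, arranged according to ALGORITHM.
 * @code
 * using BlockLoadT = hipcub::BlockLoad<int, 128, 4, hipcub::BLOCK_LOAD_TRANSPOSE>;
 * __shared__ typename BlockLoadT::TempStorage temp_storage;
 *
 * int thread_data[4];
 * BlockLoadT(temp_storage).Load(d_in + blockIdx.x * (128 * 4), thread_data);
 * @endcode
 */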
template<class InputIteratorT>
HIPCUB_DEVICE inline
void Load(InputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD])
{
base_type::load(block_iter, items, temp_storage_);
}
template<class InputIteratorT>
HIPCUB_DEVICE inline
void Load(InputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items)
{
base_type::load(block_iter, items, valid_items, temp_storage_);
}
template<
class InputIteratorT,
class Default
>
HIPCUB_DEVICE inline
void Load(InputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items,
Default oob_default)
{
base_type::load(block_iter, items, valid_items, oob_default, temp_storage_);
}
private:
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_LOAD_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_LOAD_FUNC_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_LOAD_FUNC_HPP_
#include "../config.hpp"
#include <cub/rocprim/block/block_load_func.hpp>
BEGIN_HIPCUB_NAMESPACE
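// The free functions below are thin wrappers over the corresponding rocPRIM
// primitives. As a concrete illustration (assuming 128 threads and 4 items
// per thread): LoadDirectBlocked hands thread t the consecutive range
// block_iter[t*4 .. t*4+3], whereas LoadDirectStriped<128> hands it the
// strided items block_iter[t], block_iter[t+128], block_iter[t+256] and
// block_iter[t+384].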
template<
typename T,
int ITEMS_PER_THREAD,
typename InputIteratorT
>
HIPCUB_DEVICE inline
void LoadDirectBlocked(int linear_id,
InputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD])
{
::rocprim::block_load_direct_blocked(
linear_id, block_iter, items
);
}
template<
typename T,
int ITEMS_PER_THREAD,
typename InputIteratorT
>
HIPCUB_DEVICE inline
void LoadDirectBlocked(int linear_id,
InputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items)
{
::rocprim::block_load_direct_blocked(
linear_id, block_iter, items, valid_items
);
}
template<
typename T,
typename Default,
int ITEMS_PER_THREAD,
typename InputIteratorT
>
HIPCUB_DEVICE inline
void LoadDirectBlocked(int linear_id,
InputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items,
Default oob_default)
{
::rocprim::block_load_direct_blocked(
linear_id, block_iter, items, valid_items, oob_default
);
}
template <
typename T,
int ITEMS_PER_THREAD
>
HIPCUB_DEVICE inline
void LoadDirectBlockedVectorized(int linear_id,
T* block_iter,
T (&items)[ITEMS_PER_THREAD])
{
::rocprim::block_load_direct_blocked_vectorized(
linear_id, block_iter, items
);
}
template<
int BLOCK_THREADS,
typename T,
int ITEMS_PER_THREAD,
typename InputIteratorT
>
HIPCUB_DEVICE inline
void LoadDirectStriped(int linear_id,
InputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD])
{
::rocprim::block_load_direct_striped<BLOCK_THREADS>(
linear_id, block_iter, items
);
}
template<
int BLOCK_THREADS,
typename T,
int ITEMS_PER_THREAD,
typename InputIteratorT
>
HIPCUB_DEVICE inline
void LoadDirectStriped(int linear_id,
InputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items)
{
::rocprim::block_load_direct_striped<BLOCK_THREADS>(
linear_id, block_iter, items, valid_items
);
}
template<
int BLOCK_THREADS,
typename T,
typename Default,
int ITEMS_PER_THREAD,
typename InputIteratorT
>
HIPCUB_DEVICE inline
void LoadDirectStriped(int linear_id,
InputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items,
Default oob_default)
{
::rocprim::block_load_direct_striped<BLOCK_THREADS>(
linear_id, block_iter, items, valid_items, oob_default
);
}
template<
typename T,
int ITEMS_PER_THREAD,
typename InputIteratorT
>
HIPCUB_DEVICE inline
void LoadDirectWarpStriped(int linear_id,
InputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD])
{
::rocprim::block_load_direct_warp_striped(
linear_id, block_iter, items
);
}
template<
typename T,
int ITEMS_PER_THREAD,
typename InputIteratorT
>
HIPCUB_DEVICE inline
void LoadDirectWarpStriped(int linear_id,
InputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items)
{
::rocprim::block_load_direct_warp_striped(
linear_id, block_iter, items, valid_items
);
}
template<
typename T,
typename Default,
int ITEMS_PER_THREAD,
typename InputIteratorT
>
HIPCUB_DEVICE inline
void LoadDirectWarpStriped(int linear_id,
InputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items,
Default oob_default)
{
::rocprim::block_load_direct_warp_striped(
linear_id, block_iter, items, valid_items, oob_default
);
}
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_LOAD_FUNC_HPP_
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_MERGE_SORT_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_MERGE_SORT_HPP_
#include "../thread/thread_sort.hpp"
#include "../util_math.cuh"
#include "../util_type.cuh"
#include <cub/rocprim/detail/various.hpp>
#include <cub/rocprim/functional.hpp>
BEGIN_HIPCUB_NAMESPACE
// Additional details of the Merge-Path Algorithm can be found in:
// S. Odeh, O. Green, Z. Mwassi, O. Shmueli, Y. Birk, "Merge Path - Parallel
// Merging Made Simple", Multithreaded Architectures and Applications (MTAAP)
// Workshop, IEEE 26th International Parallel & Distributed Processing
// Symposium (IPDPS), 2012
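// MergePath returns, for a given diagonal `diag` of the merge grid, how many
// elements of keys1 precede that diagonal in the merged output; the remaining
// diag - result elements come from keys2. A small worked example (illustrative
// only): merging keys1 = {1, 3, 5} with keys2 = {2, 4} under operator< gives
// {1, 2, 3, 4, 5}; at diag = 3 the first three merged elements {1, 2, 3} take
// two items from keys1 and one from keys2, so MergePath returns 2.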
template <typename KeyT,
typename KeyIteratorT,
typename OffsetT,
typename BinaryPred>
HIPCUB_DEVICE __forceinline__ OffsetT MergePath(KeyIteratorT keys1,
KeyIteratorT keys2,
OffsetT keys1_count,
OffsetT keys2_count,
OffsetT diag,
BinaryPred binary_pred)
{
OffsetT keys1_begin = diag < keys2_count ? 0 : diag - keys2_count;
OffsetT keys1_end = (::rocprim::min)(diag, keys1_count);
while (keys1_begin < keys1_end)
{
OffsetT mid = cub::MidPoint<OffsetT>(keys1_begin, keys1_end);
KeyT key1 = keys1[mid];
KeyT key2 = keys2[diag - 1 - mid];
bool pred = binary_pred(key2, key1);
if (pred)
{
keys1_end = mid;
}
else
{
keys1_begin = mid + 1;
}
}
return keys1_begin;
}
template <typename KeyT, typename CompareOp, int ITEMS_PER_THREAD>
HIPCUB_DEVICE __forceinline__ void SerialMerge(KeyT *keys_shared,
int keys1_beg,
int keys2_beg,
int keys1_count,
int keys2_count,
KeyT (&output)[ITEMS_PER_THREAD],
int (&indices)[ITEMS_PER_THREAD],
CompareOp compare_op)
{
int keys1_end = keys1_beg + keys1_count;
int keys2_end = keys2_beg + keys2_count;
KeyT key1 = keys_shared[keys1_beg];
KeyT key2 = keys_shared[keys2_beg];
#pragma unroll
for (int item = 0; item < ITEMS_PER_THREAD; ++item)
{
bool p = (keys2_beg < keys2_end) &&
((keys1_beg >= keys1_end)
|| compare_op(key2, key1));
output[item] = p ? key2 : key1;
indices[item] = p ? keys2_beg++ : keys1_beg++;
if (p)
{
key2 = keys_shared[keys2_beg];
}
else
{
key1 = keys_shared[keys1_beg];
}
}
}
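// Note: on its final iteration SerialMerge may read keys_shared one element
// past the consumed ranges, which is why the shared-memory arrays in
// BlockMergeSortStrategy below are sized ITEMS_PER_TILE + 1.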
/**
* @brief Generalized merge sort algorithm
*
* This class is used to reduce code duplication. Warp and Block merge sort
* differ only in how they compute thread index and how they synchronize
* threads. Since synchronization might require access to custom data
* (like member mask), CRTP is used.
*
* @par
* The code snippet below illustrates the way this class can be used.
* @par
* @code
* #include <hipcub/hipcub.hpp> // or equivalently <hipcub/block/block_merge_sort.hpp>
*
* constexpr int BLOCK_THREADS = 256;
* constexpr int ITEMS_PER_THREAD = 9;
*
* class BlockMergeSort : public BlockMergeSortStrategy<int,
* cub::NullType,
* BLOCK_THREADS,
* ITEMS_PER_THREAD,
* BlockMergeSort>
* {
* using BlockMergeSortStrategyT =
* BlockMergeSortStrategy<int,
* cub::NullType,
* BLOCK_THREADS,
* ITEMS_PER_THREAD,
* BlockMergeSort>;
* public:
* __device__ __forceinline__ explicit BlockMergeSort(
* typename BlockMergeSortStrategyT::TempStorage &temp_storage)
* : BlockMergeSortStrategyT(temp_storage, threadIdx.x)
* {}
*
* __device__ __forceinline__ void SyncImplementation() const
* {
* __syncthreads();
* }
* };
* @endcode
*
* @tparam KeyT
* KeyT type
*
* @tparam ValueT
* ValueT type. cub::NullType indicates a keys-only sort
*
* @tparam SynchronizationPolicy
* Provides a way of synchronizing threads. Should be derived from
* `BlockMergeSortStrategy`.
*/
template <typename KeyT,
typename ValueT,
int NUM_THREADS,
int ITEMS_PER_THREAD,
typename SynchronizationPolicy>
class BlockMergeSortStrategy
{
static_assert(PowerOfTwo<NUM_THREADS>::VALUE,
"NUM_THREADS must be a power of two");
private:
static constexpr int ITEMS_PER_TILE = ITEMS_PER_THREAD * NUM_THREADS;
// Whether or not there are values to be trucked along with keys
static constexpr bool KEYS_ONLY = ::rocprim::Equals<ValueT, NullType>::VALUE;
/// Shared memory type required by this thread block
union _TempStorage
{
KeyT keys_shared[ITEMS_PER_TILE + 1];
ValueT items_shared[ITEMS_PER_TILE + 1];
}; // union TempStorage
/// Shared storage reference
_TempStorage &temp_storage;
/// Internal storage allocator
HIPCUB_DEVICE __forceinline__ _TempStorage& PrivateStorage()
{
__shared__ _TempStorage private_storage;
return private_storage;
}
const unsigned int linear_tid;
public:
/// \smemstorage{BlockMergeSort}
struct TempStorage : Uninitialized<_TempStorage> {};
BlockMergeSortStrategy() = delete;
explicit HIPCUB_DEVICE __forceinline__
BlockMergeSortStrategy(unsigned int linear_tid)
: temp_storage(PrivateStorage())
, linear_tid(linear_tid)
{}
HIPCUB_DEVICE __forceinline__ BlockMergeSortStrategy(TempStorage &temp_storage,
unsigned int linear_tid)
: temp_storage(temp_storage.Alias())
, linear_tid(linear_tid)
{}
HIPCUB_DEVICE __forceinline__ unsigned int get_linear_tid() const
{
return linear_tid;
}
/**
* @brief Sorts items partitioned across a CUDA thread block using
* a merge sorting method.
*
* @par
* Sort is not guaranteed to be stable. That is, suppose that i and j are
* equivalent: neither one is less than the other. It is not guaranteed
* that the relative order of these two elements will be preserved by sort.
*
* @tparam CompareOp
* functor type having member `bool operator()(KeyT lhs, KeyT rhs)`.
* `CompareOp` is a model of [Strict Weak Ordering].
*
* @param[in,out] keys
* Keys to sort
*
* @param[in] compare_op
* Comparison function object which returns true if the first argument is
* ordered before the second
*
* [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order
*/
template <typename CompareOp>
HIPCUB_DEVICE __forceinline__ void Sort(KeyT (&keys)[ITEMS_PER_THREAD],
CompareOp compare_op)
{
ValueT items[ITEMS_PER_THREAD];
Sort<CompareOp, false>(keys, items, compare_op, ITEMS_PER_TILE, keys[0]);
}
/**
* @brief Sorts items partitioned across a CUDA thread block using
* a merge sorting method.
*
* @par
* - Sort is not guaranteed to be stable. That is, suppose that `i` and `j`
* are equivalent: neither one is less than the other. It is not guaranteed
* that the relative order of these two elements will be preserved by sort.
* - The value of `oob_default` is assigned to all elements that are out of
* `valid_items` boundaries. It's expected that `oob_default` is ordered
* after any value in the `valid_items` boundaries. The algorithm always
* sorts a fixed amount of elements, which is equal to
* `ITEMS_PER_THREAD * BLOCK_THREADS`. If there is a value that is ordered
* after `oob_default`, it won't be placed within `valid_items` boundaries.
*
* @tparam CompareOp
* functor type having member `bool operator()(KeyT lhs, KeyT rhs)`.
* `CompareOp` is a model of [Strict Weak Ordering].
*
* @param[in,out] keys
* Keys to sort
*
* @param[in] compare_op
* Comparison function object which returns true if the first argument is
* ordered before the second
*
* @param[in] valid_items
* Number of valid items to sort
*
* @param[in] oob_default
* Default value to assign out-of-bound items
*
* [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order
*/
template <typename CompareOp>
HIPCUB_DEVICE __forceinline__ void Sort(KeyT (&keys)[ITEMS_PER_THREAD],
CompareOp compare_op,
int valid_items,
KeyT oob_default)
{
ValueT items[ITEMS_PER_THREAD];
Sort<CompareOp, true>(keys, items, compare_op, valid_items, oob_default);
}
/**
* @brief Sorts items partitioned across a CUDA thread block using a merge sorting method.
*
* @par
* Sort is not guaranteed to be stable. That is, suppose that `i` and `j` are
* equivalent: neither one is less than the other. It is not guaranteed
* that the relative order of these two elements will be preserved by sort.
*
* @tparam CompareOp
* functor type having member `bool operator()(KeyT lhs, KeyT rhs)`.
* `CompareOp` is a model of [Strict Weak Ordering].
*
* @param[in,out] keys
* Keys to sort
*
* @param[in,out] items
* Values to sort
*
* @param[in] compare_op
* Comparison function object which returns true if the first argument is
* ordered before the second
*
* [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order
*/
template <typename CompareOp>
HIPCUB_DEVICE __forceinline__ void Sort(KeyT (&keys)[ITEMS_PER_THREAD],
ValueT (&items)[ITEMS_PER_THREAD],
CompareOp compare_op)
{
Sort<CompareOp, false>(keys, items, compare_op, ITEMS_PER_TILE, keys[0]);
}
/**
* @brief Sorts items partitioned across a CUDA thread block using
* a merge sorting method.
*
* @par
* - Sort is not guaranteed to be stable. That is, suppose that `i` and `j`
* are equivalent: neither one is less than the other. It is not guaranteed
* that the relative order of these two elements will be preserved by sort.
* - The value of `oob_default` is assigned to all elements that are out of
* `valid_items` boundaries. It's expected that `oob_default` is ordered
* after any value in the `valid_items` boundaries. The algorithm always
* sorts a fixed amount of elements, which is equal to
* `ITEMS_PER_THREAD * BLOCK_THREADS`. If there is a value that is ordered
* after `oob_default`, it won't be placed within `valid_items` boundaries.
*
* @tparam CompareOp
* functor type having member `bool operator()(KeyT lhs, KeyT rhs)`
* `CompareOp` is a model of [Strict Weak Ordering].
*
* @tparam IS_LAST_TILE
* True if `valid_items` isn't equal to the `ITEMS_PER_TILE`
*
* @param[in,out] keys
* Keys to sort
*
* @param[in,out] items
* Values to sort
*
* @param[in] compare_op
* Comparison function object which returns true if the first argument is
* ordered before the second
*
* @param[in] valid_items
* Number of valid items to sort
*
* @param[in] oob_default
* Default value to assign out-of-bound items
*
* [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order
*/
template <typename CompareOp,
bool IS_LAST_TILE = true>
HIPCUB_DEVICE __forceinline__ void Sort(KeyT (&keys)[ITEMS_PER_THREAD],
ValueT (&items)[ITEMS_PER_THREAD],
CompareOp compare_op,
int valid_items,
KeyT oob_default)
{
if (IS_LAST_TILE)
{
// if last tile, find valid max_key
// and fill the remaining keys with it
//
KeyT max_key = oob_default;
#pragma unroll
for (int item = 1; item < ITEMS_PER_THREAD; ++item)
{
if (ITEMS_PER_THREAD * static_cast<int>(linear_tid) + item < valid_items)
{
max_key = compare_op(max_key, keys[item]) ? keys[item] : max_key;
}
else
{
keys[item] = max_key;
}
}
}
// if first element of thread is in input range, stable sort items
//
if (!IS_LAST_TILE || ITEMS_PER_THREAD * static_cast<int>(linear_tid) < valid_items)
{
StableOddEvenSort(keys, items, compare_op);
}
// each thread has sorted keys
// merge sort keys in shared memory
//
#pragma unroll
for (int target_merged_threads_number = 2;
target_merged_threads_number <= NUM_THREADS;
target_merged_threads_number *= 2)
{
int merged_threads_number = target_merged_threads_number / 2;
int mask = target_merged_threads_number - 1;
Sync();
// store keys in shmem
//
#pragma unroll
for (int item = 0; item < ITEMS_PER_THREAD; ++item)
{
int idx = ITEMS_PER_THREAD * linear_tid + item;
temp_storage.keys_shared[idx] = keys[item];
}
Sync();
int indices[ITEMS_PER_THREAD];
int first_thread_idx_in_thread_group_being_merged = ~mask & linear_tid;
int start = ITEMS_PER_THREAD * first_thread_idx_in_thread_group_being_merged;
int size = ITEMS_PER_THREAD * merged_threads_number;
int thread_idx_in_thread_group_being_merged = mask & linear_tid;
int diag =
(::rocprim::min)(valid_items,
ITEMS_PER_THREAD * thread_idx_in_thread_group_being_merged);
int keys1_beg = (::rocprim::min)(valid_items, start);
int keys1_end = (::rocprim::min)(valid_items, keys1_beg + size);
int keys2_beg = keys1_end;
int keys2_end = (::rocprim::min)(valid_items, keys2_beg + size);
int keys1_count = keys1_end - keys1_beg;
int keys2_count = keys2_end - keys2_beg;
int partition_diag = MergePath<KeyT>(&temp_storage.keys_shared[keys1_beg],
&temp_storage.keys_shared[keys2_beg],
keys1_count,
keys2_count,
diag,
compare_op);
int keys1_beg_loc = keys1_beg + partition_diag;
int keys1_end_loc = keys1_end;
int keys2_beg_loc = keys2_beg + diag - partition_diag;
int keys2_end_loc = keys2_end;
int keys1_count_loc = keys1_end_loc - keys1_beg_loc;
int keys2_count_loc = keys2_end_loc - keys2_beg_loc;
SerialMerge(&temp_storage.keys_shared[0],
keys1_beg_loc,
keys2_beg_loc,
keys1_count_loc,
keys2_count_loc,
keys,
indices,
compare_op);
if (!KEYS_ONLY)
{
Sync();
// store keys in shmem
//
#pragma unroll
for (int item = 0; item < ITEMS_PER_THREAD; ++item)
{
int idx = ITEMS_PER_THREAD * linear_tid + item;
temp_storage.items_shared[idx] = items[item];
}
Sync();
// gather items from shmem
//
#pragma unroll
for (int item = 0; item < ITEMS_PER_THREAD; ++item)
{
items[item] = temp_storage.items_shared[indices[item]];
}
}
}
} // Sort
/**
* @brief Sorts items partitioned across a CUDA thread block using
* a merge sorting method.
*
* @par
* StableSort is stable: it preserves the relative ordering of equivalent
* elements. That is, if `x` and `y` are elements such that `x` precedes `y`,
* and if the two elements are equivalent (neither `x < y` nor `y < x`) then
* a postcondition of StableSort is that `x` still precedes `y`.
*
* @tparam CompareOp
* functor type having member `bool operator()(KeyT lhs, KeyT rhs)`.
* `CompareOp` is a model of [Strict Weak Ordering].
*
* @param[in,out] keys
* Keys to sort
*
* @param[in] compare_op
* Comparison function object which returns true if the first argument is
* ordered before the second
*
* [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order
*/
template <typename CompareOp>
HIPCUB_DEVICE __forceinline__ void StableSort(KeyT (&keys)[ITEMS_PER_THREAD],
CompareOp compare_op)
{
Sort(keys, compare_op);
}
/**
* @brief Sorts items partitioned across a CUDA thread block using
* a merge sorting method.
*
* @par
* StableSort is stable: it preserves the relative ordering of equivalent
* elements. That is, if `x` and `y` are elements such that `x` precedes `y`,
* and if the two elements are equivalent (neither `x < y` nor `y < x`) then
* a postcondition of StableSort is that `x` still precedes `y`.
*
* @tparam CompareOp
* functor type having member `bool operator()(KeyT lhs, KeyT rhs)`.
* `CompareOp` is a model of [Strict Weak Ordering].
*
* @param[in,out] keys
* Keys to sort
*
* @param[in,out] items
* Values to sort
*
* @param[in] compare_op
* Comparison function object which returns true if the first argument is
* ordered before the second
*
* [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order
*/
template <typename CompareOp>
HIPCUB_DEVICE __forceinline__ void StableSort(KeyT (&keys)[ITEMS_PER_THREAD],
ValueT (&items)[ITEMS_PER_THREAD],
CompareOp compare_op)
{
Sort(keys, items, compare_op);
}
/**
* @brief Sorts items partitioned across a CUDA thread block using
* a merge sorting method.
*
* @par
* - StableSort is stable: it preserves the relative ordering of equivalent
* elements. That is, if `x` and `y` are elements such that `x` precedes
* `y`, and if the two elements are equivalent (neither `x < y` nor `y < x`)
* then a postcondition of StableSort is that `x` still precedes `y`.
* - The value of `oob_default` is assigned to all elements that are out of
* `valid_items` boundaries. It's expected that `oob_default` is ordered
* after any value in the `valid_items` boundaries. The algorithm always
* sorts a fixed amount of elements, which is equal to
* `ITEMS_PER_THREAD * BLOCK_THREADS`.
* If there is a value that is ordered after `oob_default`, it won't be
* placed within `valid_items` boundaries.
*
* @tparam CompareOp
* functor type having member `bool operator()(KeyT lhs, KeyT rhs)`.
* `CompareOp` is a model of [Strict Weak Ordering].
*
* @param[in,out] keys
* Keys to sort
*
* @param[in] compare_op
* Comparison function object which returns true if the first argument is
* ordered before the second
*
* @param[in] valid_items
* Number of valid items to sort
*
* @param[in] oob_default
* Default value to assign out-of-bound items
*
* [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order
*/
template <typename CompareOp>
HIPCUB_DEVICE __forceinline__ void StableSort(KeyT (&keys)[ITEMS_PER_THREAD],
CompareOp compare_op,
int valid_items,
KeyT oob_default)
{
Sort(keys, compare_op, valid_items, oob_default);
}
/**
* @brief Sorts items partitioned across a CUDA thread block using
* a merge sorting method.
*
* @par
* - StableSort is stable: it preserves the relative ordering of equivalent
* elements. That is, if `x` and `y` are elements such that `x` precedes
* `y`, and if the two elements are equivalent (neither `x < y` nor `y < x`)
* then a postcondition of StableSort is that `x` still precedes `y`.
* - The value of `oob_default` is assigned to all elements that are out of
* `valid_items` boundaries. It's expected that `oob_default` is ordered
* after any value in the `valid_items` boundaries. The algorithm always
* sorts a fixed amount of elements, which is equal to
* `ITEMS_PER_THREAD * BLOCK_THREADS`. If there is a value that is ordered
* after `oob_default`, it won't be placed within `valid_items` boundaries.
*
* @tparam CompareOp
* functor type having member `bool operator()(KeyT lhs, KeyT rhs)`.
* `CompareOp` is a model of [Strict Weak Ordering].
*
* @tparam IS_LAST_TILE
* True if `valid_items` isn't equal to the `ITEMS_PER_TILE`
*
* @param[in,out] keys
* Keys to sort
*
* @param[in,out] items
* Values to sort
*
* @param[in] compare_op
* Comparison function object which returns true if the first argument is
* ordered before the second
*
* @param[in] valid_items
* Number of valid items to sort
*
* @param[in] oob_default
* Default value to assign out-of-bound items
*
* [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order
*/
template <typename CompareOp,
bool IS_LAST_TILE = true>
HIPCUB_DEVICE __forceinline__ void StableSort(KeyT (&keys)[ITEMS_PER_THREAD],
ValueT (&items)[ITEMS_PER_THREAD],
CompareOp compare_op,
int valid_items,
KeyT oob_default)
{
Sort<CompareOp, IS_LAST_TILE>(keys,
items,
compare_op,
valid_items,
oob_default);
}
private:
HIPCUB_DEVICE __forceinline__ void Sync() const
{
static_cast<const SynchronizationPolicy*>(this)->SyncImplementation();
}
};
/**
* @brief The BlockMergeSort class provides methods for sorting items
* partitioned across a CUDA thread block using a merge sorting method.
* @ingroup BlockModule
*
* @tparam KeyT
* KeyT type
*
* @tparam BLOCK_DIM_X
* The thread block length in threads along the X dimension
*
* @tparam ITEMS_PER_THREAD
* The number of items per thread
*
* @tparam ValueT
* **[optional]** ValueT type (default: `cub::NullType`, which indicates
* a keys-only sort)
*
* @tparam BLOCK_DIM_Y
* **[optional]** The thread block length in threads along the Y dimension
* (default: 1)
*
* @tparam BLOCK_DIM_Z
* **[optional]** The thread block length in threads along the Z dimension
* (default: 1)
*
* @par Overview
* BlockMergeSort arranges items into ascending order using a comparison
* functor with less-than semantics. Merge sort can handle arbitrary types
* and comparison functors, but is slower than BlockRadixSort when sorting
* arithmetic types into ascending/descending order.
*
* @par A Simple Example
* @blockcollective{BlockMergeSort}
* @par
* The code snippet below illustrates a sort of 512 integer keys that are
* partitioned across 128 threads where each thread owns 4 consecutive items.
* @par
* @code
* #include <hipcub/hipcub.hpp> // or equivalently <hipcub/block/block_merge_sort.hpp>
*
* struct CustomLess
* {
* template <typename DataType>
* __device__ bool operator()(const DataType &lhs, const DataType &rhs)
* {
* return lhs < rhs;
* }
* };
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockMergeSort for a 1D block of 128 threads owning 4 integer items each
* typedef cub::BlockMergeSort<int, 128, 4> BlockMergeSort;
*
* // Allocate shared memory for BlockMergeSort
* __shared__ typename BlockMergeSort::TempStorage temp_storage_shuffle;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_keys[4];
* ...
*
* BlockMergeSort(temp_storage_shuffle).Sort(thread_keys, CustomLess());
* ...
* }
* @endcode
* @par
* Suppose the set of input `thread_keys` across the block of threads is
* `{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }`.
* The corresponding output `thread_keys` in those threads will be
* `{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }`.
*
* @par Re-using dynamically allocated shared memory
* The following example under the examples/block folder illustrates usage of
* dynamically allocated shared memory with BlockReduce and how to re-purpose
* the same memory region:
* <a href="../../examples/block/example_block_reduce_dyn_smem.cu">example_block_reduce_dyn_smem.cu</a>
*
* This example can be easily adapted to the storage required by BlockMergeSort.
*/
template <typename KeyT,
int BLOCK_DIM_X,
int ITEMS_PER_THREAD,
typename ValueT = NullType,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1>
class BlockMergeSort
: public BlockMergeSortStrategy<KeyT,
ValueT,
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
ITEMS_PER_THREAD,
BlockMergeSort<KeyT,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
ValueT,
BLOCK_DIM_Y,
BLOCK_DIM_Z>>
{
private:
// The thread block size in threads
static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
static constexpr int ITEMS_PER_TILE = ITEMS_PER_THREAD * BLOCK_THREADS;
using BlockMergeSortStrategyT =
BlockMergeSortStrategy<KeyT,
ValueT,
BLOCK_THREADS,
ITEMS_PER_THREAD,
BlockMergeSort>;
public:
HIPCUB_DEVICE __forceinline__ BlockMergeSort()
: BlockMergeSortStrategyT(
RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
HIPCUB_DEVICE __forceinline__ explicit BlockMergeSort(
typename BlockMergeSortStrategyT::TempStorage &temp_storage)
: BlockMergeSortStrategyT(
temp_storage,
RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
private:
HIPCUB_DEVICE __forceinline__ void SyncImplementation() const
{
CTA_SYNC();
}
friend BlockMergeSortStrategyT;
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_MERGE_SORT_HPP_
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block
*/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_RADIX_RANK_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_RADIX_RANK_HPP_
#include <stdint.h>
#include "../config.hpp"
#include "../util_type.cuh"
#include "../util_ptx.cuh"
#include "../thread/thread_reduce.cuh"
#include "../thread/thread_scan.cuh"
#include "../block/block_scan.cuh"
#include "../block/radix_rank_sort_operations.hpp"
BEGIN_HIPCUB_NAMESPACE
/**
* \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
* \ingroup BlockModule
*
* \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
* \tparam RADIX_BITS The number of radix bits per digit place
* \tparam IS_DESCENDING Whether or not the sorted-order is high-to-low
* \tparam MEMOIZE_OUTER_SCAN <b>[optional]</b> Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: false). See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
* \tparam INNER_SCAN_ALGORITHM <b>[optional]</b> The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS)
* \tparam SMEM_CONFIG <b>[optional]</b> Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
* \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
* \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
* \tparam ARCH <b>[optional]</b> \ptxversion
*
* \par Overview
* BlockRadixRank computes, for each key in a tile of keys partitioned across the block, the key's rank for a given digit place.
* - Keys must be in a form suitable for radix ranking (i.e., unsigned bits).
* - \blocked
*
* \par Performance Considerations
* - \granularity
*
* \par Examples
* \par
* - <b>Example 1:</b> Simple radix rank of 32-bit integer keys
* \code
* #include <hipcub/hipcub.hpp>
*
* template <int BLOCK_THREADS>
* __global__ void ExampleKernel(...)
* {
*     // Specialize BlockRadixRank for a 1D block of BLOCK_THREADS threads, ranking 4-bit digits
*     typedef cub::BlockRadixRank<BLOCK_THREADS, 4, false> BlockRadixRank;
*
*     // Allocate shared memory for BlockRadixRank
*     __shared__ typename BlockRadixRank::TempStorage temp_storage;
*     unsigned int keys[1];
*     int ranks[1];
*     ...
*     BlockRadixRank(temp_storage).RankKeys(keys, ranks, digit_extractor);
* }
* \endcode
*/
template <
int BLOCK_DIM_X,
int RADIX_BITS,
bool IS_DESCENDING,
bool MEMOIZE_OUTER_SCAN = false,
BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS,
cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH /* ignored */>
class BlockRadixRank
{
private:
/******************************************************************************
* Type definitions and constants
******************************************************************************/
// Integer type for digit counters (to be packed into words of type PackedCounters)
typedef unsigned short DigitCounter;
// Integer type for packing DigitCounters into columns of shared memory banks
typedef typename std::conditional<(SMEM_CONFIG == cudaSharedMemBankSizeEightByte),
unsigned long long,
unsigned int>::type PackedCounter;
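// Example: with four-byte banks PackedCounter is unsigned int, so PACKING_RATIO =
// sizeof(unsigned int) / sizeof(unsigned short) = 2 and each packed word carries
// two digit counters that are scanned together in a single prefix-sum pass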
enum
{
// The thread block size in threads
BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
RADIX_DIGITS = 1 << RADIX_BITS,
LOG_WARP_THREADS = Log2<HIPCUB_DEVICE_WARP_THREADS>::VALUE,
WARP_THREADS = 1 << LOG_WARP_THREADS,
WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
BYTES_PER_COUNTER = sizeof(DigitCounter),
LOG_BYTES_PER_COUNTER = Log2<BYTES_PER_COUNTER>::VALUE,
PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter),
LOG_PACKING_RATIO = Log2<PACKING_RATIO>::VALUE,
LOG_COUNTER_LANES = rocprim::maximum<int>()((int(RADIX_BITS) - int(LOG_PACKING_RATIO)), 0), // Always at least one lane
COUNTER_LANES = 1 << LOG_COUNTER_LANES,
// The number of packed counters per thread (plus one for padding)
PADDED_COUNTER_LANES = COUNTER_LANES + 1,
RAKING_SEGMENT = PADDED_COUNTER_LANES,
};
public:
enum
{
/// Number of bin-starting offsets tracked per thread
BINS_TRACKED_PER_THREAD = rocprim::maximum<int>()(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
};
private:
/// BlockScan type
typedef BlockScan<
PackedCounter,
BLOCK_DIM_X,
INNER_SCAN_ALGORITHM,
BLOCK_DIM_Y,
BLOCK_DIM_Z,
ARCH>
BlockScan;
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/// Shared memory storage layout type for BlockRadixRank
struct __align__(16) _TempStorage
{
union Aliasable
{
DigitCounter digit_counters[PADDED_COUNTER_LANES * BLOCK_THREADS * PACKING_RATIO];
PackedCounter raking_grid[BLOCK_THREADS * RAKING_SEGMENT];
} aliasable;
// Storage for scanning local ranks
typename BlockScan::TempStorage block_scan;
};
#endif
/******************************************************************************
* Thread fields
******************************************************************************/
/// Shared storage reference
_TempStorage &temp_storage;
/// Linear thread-id
unsigned int linear_tid;
/// Copy of raking segment, promoted to registers
PackedCounter cached_segment[RAKING_SEGMENT];
/******************************************************************************
* Utility methods
******************************************************************************/
/**
* Internal storage allocator
*/
HIPCUB_DEVICE inline _TempStorage& PrivateStorage()
{
__shared__ _TempStorage private_storage;
return private_storage;
}
/**
* Performs upsweep raking reduction, returning the aggregate
*/
HIPCUB_DEVICE inline PackedCounter Upsweep()
{
PackedCounter *smem_raking_ptr = &temp_storage.aliasable.raking_grid[linear_tid * RAKING_SEGMENT];
PackedCounter *raking_ptr;
if (MEMOIZE_OUTER_SCAN)
{
// Copy data into registers
#pragma unroll
for (int i = 0; i < RAKING_SEGMENT; i++)
{
cached_segment[i] = smem_raking_ptr[i];
}
raking_ptr = cached_segment;
}
else
{
raking_ptr = smem_raking_ptr;
}
return internal::ThreadReduce<RAKING_SEGMENT>(raking_ptr, Sum());
}
/// Performs exclusive downsweep raking scan
HIPCUB_DEVICE inline void ExclusiveDownsweep(
PackedCounter raking_partial)
{
PackedCounter *smem_raking_ptr = &temp_storage.aliasable.raking_grid[linear_tid * RAKING_SEGMENT];
PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ?
cached_segment :
smem_raking_ptr;
// Exclusive raking downsweep scan
internal::ThreadScanExclusive<RAKING_SEGMENT>(raking_ptr, raking_ptr, Sum(), raking_partial);
if (MEMOIZE_OUTER_SCAN)
{
// Copy data back to smem
#pragma unroll
for (int i = 0; i < RAKING_SEGMENT; i++)
{
smem_raking_ptr[i] = cached_segment[i];
}
}
}
/**
* Reset shared memory digit counters
*/
HIPCUB_DEVICE inline void ResetCounters()
{
// Reset shared memory digit counters
#pragma unroll
for (int LANE = 0; LANE < PADDED_COUNTER_LANES; LANE++)
{
#pragma unroll
for (int SUB_COUNTER = 0; SUB_COUNTER < PACKING_RATIO; SUB_COUNTER++)
{
temp_storage.aliasable.digit_counters[(LANE * BLOCK_THREADS + linear_tid) * PACKING_RATIO + SUB_COUNTER] = 0;
}
}
}
/**
* Block-scan prefix callback
*/
struct PrefixCallBack
{
HIPCUB_DEVICE inline PackedCounter operator()(PackedCounter block_aggregate)
{
PackedCounter block_prefix = 0;
// Propagate totals in packed fields
#pragma unroll
for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++)
{
block_prefix += block_aggregate << (sizeof(DigitCounter) * 8 * PACKED);
}
return block_prefix;
}
};
/**
* Scan shared memory digit counters.
*/
HIPCUB_DEVICE inline void ScanCounters()
{
// Upsweep scan
PackedCounter raking_partial = Upsweep();
// Compute exclusive sum
PackedCounter exclusive_partial;
PrefixCallBack prefix_call_back;
BlockScan(temp_storage.block_scan).ExclusiveSum(raking_partial, exclusive_partial, prefix_call_back);
// Downsweep scan with exclusive partial
ExclusiveDownsweep(exclusive_partial);
}
public:
/// \smemstorage{BlockRadixRank}
struct TempStorage : Uninitialized<_TempStorage> {};
/******************************************************************//**
* \name Collective constructors
*********************************************************************/
//@{
/**
* \brief Collective constructor using a private static allocation of shared memory as temporary storage.
*/
HIPCUB_DEVICE inline BlockRadixRank()
:
temp_storage(PrivateStorage()),
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
/**
* \brief Collective constructor using the specified memory allocation as temporary storage.
*/
HIPCUB_DEVICE inline BlockRadixRank(
TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
:
temp_storage(temp_storage.Alias()),
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
//@} end member group
/******************************************************************//**
* \name Raking
*********************************************************************/
//@{
/**
* \brief Rank keys.
*/
template <
typename UnsignedBits,
int KEYS_PER_THREAD,
typename DigitExtractorT>
HIPCUB_DEVICE inline void RankKeys(
UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile
int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile
DigitExtractorT digit_extractor) ///< [in] The digit extractor
{
DigitCounter thread_prefixes[KEYS_PER_THREAD]; // For each key, the count of previous keys in this tile having the same digit
DigitCounter* digit_counters[KEYS_PER_THREAD]; // For each key, the byte-offset of its corresponding digit counter in smem
// Reset shared memory digit counters
ResetCounters();
#pragma unroll
for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
{
// Get digit
unsigned int digit = digit_extractor.Digit(keys[ITEM]);
// Get sub-counter
unsigned int sub_counter = digit >> LOG_COUNTER_LANES;
// Get counter lane
unsigned int counter_lane = digit & (COUNTER_LANES - 1);
if (IS_DESCENDING)
{
sub_counter = PACKING_RATIO - 1 - sub_counter;
counter_lane = COUNTER_LANES - 1 - counter_lane;
}
// Pointer to smem digit counter
digit_counters[ITEM] = &temp_storage.aliasable.digit_counters[counter_lane * BLOCK_THREADS * PACKING_RATIO + linear_tid * PACKING_RATIO + sub_counter];
// Load thread-exclusive prefix
thread_prefixes[ITEM] = *digit_counters[ITEM];
// Store inclusive prefix
*digit_counters[ITEM] = thread_prefixes[ITEM] + 1;
}
::rocprim::syncthreads();
// Scan shared memory counters
ScanCounters();
::rocprim::syncthreads();
// Extract the local ranks of each key
#pragma unroll
for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
{
// Add in thread block exclusive prefix
ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM];
}
}
/**
* \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread.
*/
template <
typename UnsignedBits,
int KEYS_PER_THREAD,
typename DigitExtractorT>
HIPCUB_DEVICE inline void RankKeys(
UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile
int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter)
DigitExtractorT digit_extractor, ///< [in] The digit extractor
int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
{
// Rank keys
RankKeys(keys, ranks, digit_extractor);
// Get the inclusive and exclusive digit totals corresponding to the calling thread.
#pragma unroll
for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
{
int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track;
if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
{
if (IS_DESCENDING)
bin_idx = RADIX_DIGITS - bin_idx - 1;
// Obtain ex/inclusive digit counts. (Unfortunately these all reside in the
// first counter column, resulting in unavoidable bank conflicts.)
unsigned int counter_lane = (bin_idx & (COUNTER_LANES - 1));
unsigned int sub_counter = bin_idx >> (LOG_COUNTER_LANES);
exclusive_digit_prefix[track] = temp_storage.aliasable.digit_counters[counter_lane * BLOCK_THREADS * PACKING_RATIO + sub_counter];
}
}
}
};
/**
* Radix-rank using match.any
*/
template <
int BLOCK_DIM_X,
int RADIX_BITS,
bool IS_DESCENDING,
BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH>
class BlockRadixRankMatch
{
private:
/******************************************************************************
* Type definitions and constants
******************************************************************************/
typedef int32_t RankT;
typedef int32_t DigitCounterT;
enum
{
// The thread block size in threads
BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
RADIX_DIGITS = 1 << RADIX_BITS,
LOG_WARP_THREADS = Log2<HIPCUB_DEVICE_WARP_THREADS>::VALUE,
WARP_THREADS = 1 << LOG_WARP_THREADS,
WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
PADDED_WARPS = ((WARPS & 0x1) == 0) ?
WARPS + 1 :
WARPS,
COUNTERS = PADDED_WARPS * RADIX_DIGITS,
RAKING_SEGMENT = (COUNTERS + BLOCK_THREADS - 1) / BLOCK_THREADS,
PADDED_RAKING_SEGMENT = ((RAKING_SEGMENT & 0x1) == 0) ?
RAKING_SEGMENT + 1 :
RAKING_SEGMENT,
};
public:
enum
{
/// Number of bin-starting offsets tracked per thread
BINS_TRACKED_PER_THREAD = rocprim::maximum<int>()(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
};
private:
/// BlockScan type
typedef BlockScan<
DigitCounterT,
BLOCK_DIM_X,
INNER_SCAN_ALGORITHM,
BLOCK_DIM_Y,
BLOCK_DIM_Z,
ARCH>
BlockScanT;
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/// Shared memory storage layout type for BlockRadixRank
struct __align__(16) _TempStorage
{
typename BlockScanT::TempStorage block_scan;
union __align__(16) Aliasable
{
volatile DigitCounterT warp_digit_counters[RADIX_DIGITS * PADDED_WARPS];
DigitCounterT raking_grid[BLOCK_THREADS * PADDED_RAKING_SEGMENT];
} aliasable;
};
#endif
/******************************************************************************
* Thread fields
******************************************************************************/
/// Shared storage reference
_TempStorage &temp_storage;
/// Linear thread-id
unsigned int linear_tid;
public:
/// \smemstorage{BlockRadixRankMatch}
struct TempStorage : Uninitialized<_TempStorage> {};
/******************************************************************//**
* \name Collective constructors
*********************************************************************/
//@{
/**
* \brief Collective constructor using the specified memory allocation as temporary storage.
*/
HIPCUB_DEVICE inline BlockRadixRankMatch(
TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
:
temp_storage(temp_storage.Alias()),
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
//@} end member group
/******************************************************************//**
* \name Raking
*********************************************************************/
//@{
/**
* \brief Rank keys.
*/
template <
typename UnsignedBits,
int KEYS_PER_THREAD,
typename DigitExtractorT>
__device__ __forceinline__ void RankKeys(
UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile
int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile
DigitExtractorT digit_extractor) ///< [in] The digit extractor
{
// Initialize shared digit counters
#pragma unroll
for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
temp_storage.aliasable.raking_grid[linear_tid * PADDED_RAKING_SEGMENT + ITEM] = 0;
::rocprim::syncthreads();
// Each warp will strip-mine its section of input, one strip at a time
volatile DigitCounterT *digit_counters[KEYS_PER_THREAD];
uint32_t warp_id = linear_tid >> LOG_WARP_THREADS;
uint32_t lane_mask_lt = LaneMaskLt();
#pragma unroll
for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
{
// My digit
uint32_t digit = digit_extractor.Digit(keys[ITEM]);
if (IS_DESCENDING)
digit = RADIX_DIGITS - digit - 1;
// Mask of peers who have same digit as me
uint32_t peer_mask = rocprim::MatchAny<RADIX_BITS>(digit);
// Pointer to smem digit counter for this key
digit_counters[ITEM] = &temp_storage.aliasable.warp_digit_counters[digit * PADDED_WARPS + warp_id];
// Number of occurrences in previous strips
DigitCounterT warp_digit_prefix = *digit_counters[ITEM];
// Warp-sync
WARP_SYNC(0xFFFFFFFF);
// Number of peers having same digit as me
int32_t digit_count = __popc(peer_mask);
// Number of lower-ranked peers having same digit seen so far
int32_t peer_digit_prefix = __popc(peer_mask & lane_mask_lt);
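// Example: if lanes 3, 5 and 9 of the warp share this digit, then for lane 5
// digit_count == 3 and peer_digit_prefix == 1 (only lane 3 precedes it)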
if (peer_digit_prefix == 0)
{
// First thread for each digit updates the shared warp counter
*digit_counters[ITEM] = DigitCounterT(warp_digit_prefix + digit_count);
}
// Warp-sync
WARP_SYNC(0xFFFFFFFF);
// Number of prior keys having same digit
ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix);
}
::rocprim::syncthreads();
// Scan warp counters
DigitCounterT scan_counters[PADDED_RAKING_SEGMENT];
#pragma unroll
for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
scan_counters[ITEM] = temp_storage.aliasable.raking_grid[linear_tid * PADDED_RAKING_SEGMENT + ITEM];
BlockScanT(temp_storage.block_scan).ExclusiveSum(scan_counters, scan_counters);
#pragma unroll
for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
temp_storage.aliasable.raking_grid[linear_tid * PADDED_RAKING_SEGMENT + ITEM] = scan_counters[ITEM];
::rocprim::syncthreads();
// Seed ranks with counter values from previous warps
#pragma unroll
for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
ranks[ITEM] += *digit_counters[ITEM];
}
/**
* \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread.
*/
template <
typename UnsignedBits,
int KEYS_PER_THREAD,
typename DigitExtractorT>
__device__ __forceinline__ void RankKeys(
UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile
int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter)
DigitExtractorT digit_extractor, ///< [in] The digit extractor
int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
{
RankKeys(keys, ranks, digit_extractor);
// Get exclusive count for each digit
#pragma unroll
for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
{
int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track;
if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
{
if (IS_DESCENDING)
bin_idx = RADIX_DIGITS - bin_idx - 1;
exclusive_digit_prefix[track] = temp_storage.aliasable.warp_digit_counters[bin_idx * PADDED_WARPS];
}
}
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_RADIX_RANK_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_RADIX_SORT_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_RADIX_SORT_HPP_
#include "../config.hpp"
#include "../util_type.cuh"
#include <cub/rocprim/functional.hpp>
#include <cub/rocprim/block/block_radix_sort.hpp>
#include "block_scan.cuh"
BEGIN_HIPCUB_NAMESPACE
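/**
 * \brief BlockRadixSort provides methods for sorting items partitioned across
 * a thread block using a radix sorting method (a thin wrapper over
 * ::rocprim::block_radix_sort).
 *
 * \par
 * A minimal usage sketch (illustrative only, mirroring the conventions of the
 * BlockMergeSort example above):
 * \code
 * #include <hipcub/hipcub.hpp>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
 *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
 *
 *     // Allocate shared memory for BlockRadixSort
 *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
 *
 *     // Obtain a segment of consecutive keys that are blocked across threads
 *     int thread_keys[4];
 *     ...
 *
 *     BlockRadixSort(temp_storage).Sort(thread_keys);
 *     ...
 * }
 * \endcode
 */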
template<
typename KeyT,
int BLOCK_DIM_X,
int ITEMS_PER_THREAD,
typename ValueT = NullType,
int RADIX_BITS = 4, /* ignored */
bool MEMOIZE_OUTER_SCAN = true, /* ignored */
BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, /* ignored */
cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, /* ignored */
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int PTX_ARCH = HIPCUB_ARCH /* ignored */
>
class BlockRadixSort
: private ::rocprim::block_radix_sort<
KeyT,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
ValueT,
BLOCK_DIM_Y,
BLOCK_DIM_Z
>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
using base_type =
typename ::rocprim::block_radix_sort<
KeyT,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
ValueT,
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
using TempStorage = typename base_type::storage_type;
HIPCUB_DEVICE inline
BlockRadixSort() : temp_storage_(private_storage())
{
}
HIPCUB_DEVICE inline
BlockRadixSort(TempStorage& temp_storage) : temp_storage_(temp_storage)
{
}
HIPCUB_DEVICE inline
void Sort(KeyT (&keys)[ITEMS_PER_THREAD],
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8)
{
base_type::sort(keys, temp_storage_, begin_bit, end_bit);
}
HIPCUB_DEVICE inline
void Sort(KeyT (&keys)[ITEMS_PER_THREAD],
ValueT (&values)[ITEMS_PER_THREAD],
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8)
{
base_type::sort(keys, values, temp_storage_, begin_bit, end_bit);
}
HIPCUB_DEVICE inline
void SortDescending(KeyT (&keys)[ITEMS_PER_THREAD],
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8)
{
base_type::sort_desc(keys, temp_storage_, begin_bit, end_bit);
}
HIPCUB_DEVICE inline
void SortDescending(KeyT (&keys)[ITEMS_PER_THREAD],
ValueT (&values)[ITEMS_PER_THREAD],
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8)
{
base_type::sort_desc(keys, values, temp_storage_, begin_bit, end_bit);
}
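// The BlockedToStriped variants below leave results in a striped arrangement:
// e.g., with 128 threads and 4 items per thread, thread t ends up holding
// elements t, t + 128, t + 256 and t + 384 of the sorted sequence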
HIPCUB_DEVICE inline
void SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD],
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8)
{
base_type::sort_to_striped(keys, temp_storage_, begin_bit, end_bit);
}
HIPCUB_DEVICE inline
void SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD],
ValueT (&values)[ITEMS_PER_THREAD],
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8)
{
base_type::sort_to_striped(keys, values, temp_storage_, begin_bit, end_bit);
}
HIPCUB_DEVICE inline
void SortDescendingBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD],
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8)
{
base_type::sort_desc_to_striped(keys, temp_storage_, begin_bit, end_bit);
}
HIPCUB_DEVICE inline
void SortDescendingBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD],
ValueT (&values)[ITEMS_PER_THREAD],
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8)
{
base_type::sort_desc_to_striped(keys, values, temp_storage_, begin_bit, end_bit);
}
private:
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_RADIX_SORT_HPP_
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data.
*/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_RAKING_LAYOUT_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_RAKING_LAYOUT_HPP_
#include <type_traits>
#include "../config.hpp"
#include <cub/rocprim/config.hpp>
#include <cub/rocprim/detail/various.hpp>
BEGIN_HIPCUB_NAMESPACE
/**
* \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data.
* \ingroup BlockModule
*
* \par Overview
* This type facilitates a shared memory usage pattern where a block of CUDA
* threads places elements into shared memory and then reduces the active
* parallelism to one "raking" warp of threads for serially aggregating consecutive
* sequences of shared items. Padding is inserted to eliminate bank conflicts
* (for most data types).
*
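* \par
* A minimal sketch of the intended access pattern (illustrative only; BLOCK_THREADS,
* linear_tid and partial are assumed to be supplied by the caller):
* \code
* typedef cub::BlockRakingLayout<int, BLOCK_THREADS> BlockRakingLayoutT;
* __shared__ typename BlockRakingLayoutT::TempStorage temp_storage;
*
* // Every thread deposits its partial into the padded grid
* *BlockRakingLayoutT::PlacementPtr(temp_storage, linear_tid) = partial;
* __syncthreads();
*
* // A single warp then rakes consecutive segments of the grid
* if (linear_tid < BlockRakingLayoutT::RAKING_THREADS)
* {
*     int *raking_segment = BlockRakingLayoutT::RakingPtr(temp_storage, linear_tid);
*     ...
* }
* \endcode
*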
* \tparam T The data type to be exchanged.
* \tparam BLOCK_THREADS The thread block size in threads.
* \tparam ARCH <b>[optional]</b> \ptxversion
*/
template <
typename T,
int BLOCK_THREADS,
int ARCH = HIPCUB_ARCH /* ignored */
>
struct BlockRakingLayout
{
//---------------------------------------------------------------------
// Constants and type definitions
//---------------------------------------------------------------------
enum
{
/// The total number of elements that need to be cooperatively reduced
SHARED_ELEMENTS = BLOCK_THREADS,
/// Maximum number of warp-synchronous raking threads
MAX_RAKING_THREADS = ::rocprim::detail::get_min_warp_size(BLOCK_THREADS, HIPCUB_DEVICE_WARP_THREADS),
/// Number of raking elements per warp-synchronous raking thread (rounded up)
SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS,
/// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads)
RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH,
/// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load
USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2),
/// Total number of elements in the raking grid
GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING),
/// Whether raking can proceed without bounds checking (the number of reduction elements is a multiple of the number of raking threads)
UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0),
};
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
* \brief Shared memory storage type
*/
struct __align__(16) _TempStorage
{
T buff[BlockRakingLayout::GRID_ELEMENTS];
};
#endif
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
/**
* \brief Returns the location for the calling thread to place data into the grid
*/
static HIPCUB_DEVICE inline T* PlacementPtr(
TempStorage &temp_storage,
unsigned int linear_tid)
{
// Offset for partial
unsigned int offset = linear_tid;
// Add in one padding element for every segment
if (USE_SEGMENT_PADDING > 0)
{
offset += offset / SEGMENT_LENGTH;
}
// Incorporating a block of padding partials every shared memory segment
return temp_storage.Alias().buff + offset;
}
/**
* \brief Returns the location for the calling thread to begin sequential raking
*/
static HIPCUB_DEVICE inline T* RakingPtr(
TempStorage &temp_storage,
unsigned int linear_tid)
{
return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING));
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_RAKING_LAYOUT_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_REDUCE_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_REDUCE_HPP_
#include <type_traits>
#include <cub/rocprim/block/block_reduce.hpp>
BEGIN_HIPCUB_NAMESPACE
namespace detail
{
inline constexpr
typename std::underlying_type<::rocprim::block_reduce_algorithm>::type
to_BlockReduceAlgorithm_enum(::rocprim::block_reduce_algorithm v)
{
using utype = std::underlying_type<::rocprim::block_reduce_algorithm>::type;
return static_cast<utype>(v);
}
}
enum BlockReduceAlgorithm
{
BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY
= detail::to_BlockReduceAlgorithm_enum(::rocprim::block_reduce_algorithm::raking_reduce_commutative_only),
BLOCK_REDUCE_RAKING
= detail::to_BlockReduceAlgorithm_enum(::rocprim::block_reduce_algorithm::raking_reduce),
BLOCK_REDUCE_WARP_REDUCTIONS
= detail::to_BlockReduceAlgorithm_enum(::rocprim::block_reduce_algorithm::using_warp_reduce)
};
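/**
 * \brief BlockReduce provides collective reduction methods across a thread
 * block (a thin wrapper over ::rocprim::block_reduce).
 *
 * \par
 * A minimal usage sketch (illustrative only):
 * \code
 * #include <hipcub/hipcub.hpp>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Specialize BlockReduce for a 1D block of 128 threads reducing ints
 *     typedef cub::BlockReduce<int, 128> BlockReduce;
 *
 *     // Allocate shared memory for BlockReduce
 *     __shared__ typename BlockReduce::TempStorage temp_storage;
 *
 *     int thread_data;
 *     ...
 *
 *     // Compute the block-wide sum (by convention the aggregate is only valid for thread 0)
 *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
 *     ...
 * }
 * \endcode
 */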
template<
typename T,
int BLOCK_DIM_X,
BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH /* ignored */
>
class BlockReduce
: private ::rocprim::block_reduce<
T,
BLOCK_DIM_X,
static_cast<::rocprim::block_reduce_algorithm>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
using base_type =
typename ::rocprim::block_reduce<
T,
BLOCK_DIM_X,
static_cast<::rocprim::block_reduce_algorithm>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
using TempStorage = typename base_type::storage_type;
HIPCUB_DEVICE inline
BlockReduce() : temp_storage_(private_storage())
{
}
HIPCUB_DEVICE inline
BlockReduce(TempStorage& temp_storage) : temp_storage_(temp_storage)
{
}
HIPCUB_DEVICE inline
T Sum(T input)
{
base_type::reduce(input, input, temp_storage_);
return input;
}
HIPCUB_DEVICE inline
T Sum(T input, int valid_items)
{
base_type::reduce(input, input, valid_items, temp_storage_);
return input;
}
template<int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline
T Sum(T(&input)[ITEMS_PER_THREAD])
{
T output;
base_type::reduce(input, output, temp_storage_);
return output;
}
template<typename ReduceOp>
HIPCUB_DEVICE inline
T Reduce(T input, ReduceOp reduce_op)
{
base_type::reduce(input, input, temp_storage_, reduce_op);
return input;
}
template<typename ReduceOp>
HIPCUB_DEVICE inline
T Reduce(T input, ReduceOp reduce_op, int valid_items)
{
base_type::reduce(input, input, valid_items, temp_storage_, reduce_op);
return input;
}
template<int ITEMS_PER_THREAD, typename ReduceOp>
HIPCUB_DEVICE inline
T Reduce(T(&input)[ITEMS_PER_THREAD], ReduceOp reduce_op)
{
T output;
base_type::reduce(input, output, temp_storage_, reduce_op);
return output;
}
private:
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_REDUCE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_RUN_LENGTH_DECODE_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_RUN_LENGTH_DECODE_HPP_
#include "../config.hpp"
#include "../thread/thread_search.cuh"
#include "../util_math.cuh"
#include "../util_ptx.cuh"
#include "../util_type.cuh"
#include "block_scan.cuh"
#include <limits>
#include <type_traits>
BEGIN_HIPCUB_NAMESPACE
/**
* \brief The BlockRunLengthDecode class supports decoding a run-length encoded array of items. That is, given
* the two arrays run_value[N] and run_lengths[N], run_value[i] is repeated run_lengths[i] many times in the output
* array.
* Due to the nature of the run-length decoding algorithm ("decompression"), the output size of the run-length decoded
* array is runtime-dependent and potentially without any upper bound. To address this, BlockRunLengthDecode allows
* retrieving a "window" from the run-length decoded array. The window's offset can be specified, and BLOCK_THREADS *
* DECODED_ITEMS_PER_THREAD (referred to as window_size) decoded items from the specified window will be returned.
*
* \note: Runs of length 0 are supported, but only as trailing runs (i.e., they may only appear at the end of the
* run_lengths array); a run of length zero may not be followed by a run of non-zero length.
*
* \par
* \code
* __global__ void ExampleKernel(...)
* {
* // Specialising BlockRunLengthDecode to run-length decode items of type uint64_t
* using RunItemT = uint64_t;
* // Type large enough to index into the run-length decoded array
* using RunLengthT = uint32_t;
*
* // Specialising BlockRunLengthDecode for a 1D block of 128 threads
* constexpr int BLOCK_DIM_X = 128;
* // Specialising BlockRunLengthDecode to have each thread contribute 2 run-length encoded runs
* constexpr int RUNS_PER_THREAD = 2;
* // Specialising BlockRunLengthDecode to have each thread hold 4 run-length decoded items
* constexpr int DECODED_ITEMS_PER_THREAD = 4;
*
* // Compose the BlockRunLengthDecode type from the parameters above
* using BlockRunLengthDecodeT =
* cub::BlockRunLengthDecode<RunItemT, BLOCK_DIM_X, RUNS_PER_THREAD, DECODED_ITEMS_PER_THREAD>;
*
* // Allocate shared memory for BlockRunLengthDecode
* __shared__ typename BlockRunLengthDecodeT::TempStorage temp_storage;
*
* // The run-length encoded items and how often they shall be repeated in the run-length decoded output
* RunItemT run_values[RUNS_PER_THREAD];
* RunLengthT run_lengths[RUNS_PER_THREAD];
* ...
*
* // Initialize the BlockRunLengthDecode with the runs that we want to run-length decode
* uint32_t total_decoded_size = 0;
* BlockRunLengthDecodeT block_rld(temp_storage, run_values, run_lengths, total_decoded_size);
*
* // Run-length decode ("decompress") the runs into a window buffer of limited size. This is repeated until all runs
* // have been decoded.
* uint32_t decoded_window_offset = 0U;
* while (decoded_window_offset < total_decoded_size)
* {
* RunLengthT relative_offsets[DECODED_ITEMS_PER_THREAD];
* RunItemT decoded_items[DECODED_ITEMS_PER_THREAD];
*
* // The number of decoded items that are valid within this window (aka pass) of run-length decoding
* uint32_t num_valid_items = total_decoded_size - decoded_window_offset;
* block_rld.RunLengthDecode(decoded_items, relative_offsets, decoded_window_offset);
*
* decoded_window_offset += BLOCK_DIM_X * DECODED_ITEMS_PER_THREAD;
*
* ...
* }
* }
* \endcode
* \par
* Suppose the set of input \p run_values across the block of threads is
* <tt>{ [0, 1], [2, 3], [4, 5], [6, 7], ..., [254, 255] }</tt> and
* \p run_lengths is <tt>{ [1, 2], [3, 4], [5, 1], [2, 3], ..., [5, 1] }</tt>.
* The corresponding output \p decoded_items in those threads will be <tt>{ [0, 1, 1, 2], [2, 2, 3, 3], [3, 3, 4, 4],
* [4, 4, 4, 5], ..., [169, 169, 170, 171] }</tt> and \p relative_offsets will be <tt>{ [0, 0, 1, 0], [1, 2, 0, 1], [2,
* 3, 0, 1], [2, 3, 4, 0], ..., [3, 4, 0, 0] }</tt> during the first iteration of the while loop.
*
* \tparam ItemT The data type of the items being run-length decoded
* \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
* \tparam RUNS_PER_THREAD The number of consecutive runs that each thread contributes
* \tparam DECODED_ITEMS_PER_THREAD The maximum number of decoded items that each thread holds
* \tparam DecodedOffsetT Type used to index into the block's decoded items (large enough to hold the sum over all the
* runs' lengths)
* \tparam BLOCK_DIM_Y The thread block length in threads along the Y dimension
* \tparam BLOCK_DIM_Z The thread block length in threads along the Z dimension
*/
template <typename ItemT,
int BLOCK_DIM_X,
int RUNS_PER_THREAD,
int DECODED_ITEMS_PER_THREAD,
typename DecodedOffsetT = uint32_t,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1>
class BlockRunLengthDecode
{
//---------------------------------------------------------------------
// CONFIGS & TYPE ALIASES
//---------------------------------------------------------------------
private:
/// The thread block size in threads
static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
/// The number of runs that the block decodes (out-of-bounds items may be padded with run lengths of '0')
static constexpr int BLOCK_RUNS = BLOCK_THREADS * RUNS_PER_THREAD;
/// BlockScan used to determine the beginning of each run (i.e., prefix sum over the runs' length)
using RunOffsetScanT = BlockScan<DecodedOffsetT, BLOCK_DIM_X, BLOCK_SCAN_WARP_SCANS, BLOCK_DIM_Y, BLOCK_DIM_Z>;
/// Type used to index into the block's runs
using RunOffsetT = uint32_t;
/// Shared memory type required by this thread block
union _TempStorage
{
typename RunOffsetScanT::TempStorage offset_scan;
struct
{
ItemT run_values[BLOCK_RUNS];
DecodedOffsetT run_offsets[BLOCK_RUNS];
} runs;
}; // union TempStorage
/// Internal storage allocator (used when the user does not provide pre-allocated shared memory)
HIPCUB_DEVICE __forceinline__ _TempStorage &PrivateStorage()
{
__shared__ _TempStorage private_storage;
return private_storage;
}
/// Shared storage reference
_TempStorage &temp_storage;
/// Linear thread-id
uint32_t linear_tid;
public:
struct TempStorage : Uninitialized<_TempStorage>
{
};
//---------------------------------------------------------------------
// CONSTRUCTOR
//---------------------------------------------------------------------
/**
* \brief Constructor specialised for user-provided temporary storage, initializing using the runs' lengths. The
* algorithm's temporary storage may not be repurposed between the constructor call and subsequent
* <b>RunLengthDecode</b> calls.
*/
template <typename RunLengthT, typename TotalDecodedSizeT>
HIPCUB_DEVICE __forceinline__ BlockRunLengthDecode(TempStorage &temp_storage,
ItemT (&run_values)[RUNS_PER_THREAD],
RunLengthT (&run_lengths)[RUNS_PER_THREAD],
TotalDecodedSizeT &total_decoded_size)
: temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
InitWithRunLengths(run_values, run_lengths, total_decoded_size);
}
/**
* \brief Constructor specialised for user-provided temporary storage, initializing using the runs' offsets. The
* algorithm's temporary storage may not be repurposed between the constructor call and subsequent
* <b>RunLengthDecode</b> calls.
*/
template <typename UserRunOffsetT>
HIPCUB_DEVICE __forceinline__ BlockRunLengthDecode(TempStorage &temp_storage,
ItemT (&run_values)[RUNS_PER_THREAD],
UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD])
: temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
InitWithRunOffsets(run_values, run_offsets);
}
/**
* \brief Constructor specialised for static temporary storage, initializing using the runs' lengths.
*/
template <typename RunLengthT, typename TotalDecodedSizeT>
HIPCUB_DEVICE __forceinline__ BlockRunLengthDecode(ItemT (&run_values)[RUNS_PER_THREAD],
RunLengthT (&run_lengths)[RUNS_PER_THREAD],
TotalDecodedSizeT &total_decoded_size)
: temp_storage(PrivateStorage()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
InitWithRunLengths(run_values, run_lengths, total_decoded_size);
}
/**
* \brief Constructor specialised for static temporary storage, initializing using the runs' offsets.
*/
template <typename UserRunOffsetT>
HIPCUB_DEVICE __forceinline__ BlockRunLengthDecode(ItemT (&run_values)[RUNS_PER_THREAD],
UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD])
: temp_storage(PrivateStorage()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
InitWithRunOffsets(run_values, run_offsets);
}
private:
/**
* \brief Returns the offset of the first value within \p input which compares greater than \p val. This version takes
* \p MAX_NUM_ITEMS, an upper bound of the array size, which will be used to determine the number of binary search
* iterations at compile time.
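* For example, with \p input = {1, 3, 6}, \p num_items = 3 and \p val = 4, the result is 2: the offset of 6,
* the first value comparing greater than 4.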
*/
template <int MAX_NUM_ITEMS,
typename InputIteratorT,
typename OffsetT,
typename T>
HIPCUB_DEVICE __forceinline__ OffsetT StaticUpperBound(InputIteratorT input, ///< [in] Input sequence
OffsetT num_items, ///< [in] Input sequence length
T val) ///< [in] Search key
{
OffsetT lower_bound = 0;
OffsetT upper_bound = num_items;
#pragma unroll
for (int i = 0; i <= Log2<MAX_NUM_ITEMS>::VALUE; i++)
{
OffsetT mid = cub::MidPoint<OffsetT>(lower_bound, upper_bound);
mid = (rocprim::min)(mid, num_items - 1);
if (val < input[mid])
{
upper_bound = mid;
}
else
{
lower_bound = mid + 1;
}
}
return lower_bound;
}
template <typename RunOffsetT>
HIPCUB_DEVICE __forceinline__ void InitWithRunOffsets(ItemT (&run_values)[RUNS_PER_THREAD],
RunOffsetT (&run_offsets)[RUNS_PER_THREAD])
{
// Keep the runs' items and the offsets of each run's beginning in the temporary storage
RunOffsetT thread_dst_offset = static_cast<RunOffsetT>(linear_tid) * static_cast<RunOffsetT>(RUNS_PER_THREAD);
#pragma unroll
for (int i = 0; i < RUNS_PER_THREAD; i++)
{
temp_storage.runs.run_values[thread_dst_offset] = run_values[i];
temp_storage.runs.run_offsets[thread_dst_offset] = run_offsets[i];
thread_dst_offset++;
}
// Ensure run offsets and run values have been written to shared memory
CTA_SYNC();
}
template <typename RunLengthT, typename TotalDecodedSizeT>
HIPCUB_DEVICE __forceinline__ void InitWithRunLengths(ItemT (&run_values)[RUNS_PER_THREAD],
RunLengthT (&run_lengths)[RUNS_PER_THREAD],
TotalDecodedSizeT &total_decoded_size)
{
// Compute the offset for the beginning of each run
DecodedOffsetT run_offsets[RUNS_PER_THREAD];
#pragma unroll
for (int i = 0; i < RUNS_PER_THREAD; i++)
{
run_offsets[i] = static_cast<DecodedOffsetT>(run_lengths[i]);
}
DecodedOffsetT decoded_size_aggregate;
RunOffsetScanT(this->temp_storage.offset_scan).ExclusiveSum(run_offsets, run_offsets, decoded_size_aggregate);
total_decoded_size = static_cast<TotalDecodedSizeT>(decoded_size_aggregate);
// Ensure the prefix scan's temporary storage can be reused (may be superfluous, but depends on scan implementation)
CTA_SYNC();
InitWithRunOffsets(run_values, run_offsets);
}
public:
/**
* \brief Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length decoded
* items in a blocked arrangement to \p decoded_items. If the number of run-length decoded items exceeds the
* run-length decode buffer (i.e., <b>DECODED_ITEMS_PER_THREAD * BLOCK_THREADS</b>), only the items that fit within
* the buffer are returned. Subsequent calls to <b>RunLengthDecode</b> adjusting \p from_decoded_offset can be
* used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to
* <b>RunLengthDecode</b> is not required.
* \p item_offsets can be used to retrieve each run-length decoded item's relative index within its run. E.g., the
* run-length encoded array of `3, 1, 4` with the respective run lengths of `2, 1, 3` would yield the run-length
* decoded array of `3, 3, 1, 4, 4, 4` with the relative offsets of `0, 1, 0, 0, 1, 2`.
* \smemreuse
*
* \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement
* \param[out] item_offsets The run-length decoded items' relative offset within the run they belong to
* \param[in] from_decoded_offset The offset within the run-length decoded output at which the returned window begins;
* invoking with a from_decoded_offset larger than total_decoded_size results in undefined behavior.
*/
template <typename RelativeOffsetT>
HIPCUB_DEVICE __forceinline__ void RunLengthDecode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD],
RelativeOffsetT (&item_offsets)[DECODED_ITEMS_PER_THREAD],
DecodedOffsetT from_decoded_offset = 0)
{
// The (global) offset of the first item decoded by this thread
DecodedOffsetT thread_decoded_offset = from_decoded_offset + linear_tid * DECODED_ITEMS_PER_THREAD;
// The run that the first decoded item of this thread belongs to
// If this thread's <thread_decoded_offset> is already beyond the total decoded size, it will be assigned to the
// last run
RunOffsetT assigned_run =
StaticUpperBound<BLOCK_RUNS>(temp_storage.runs.run_offsets, BLOCK_RUNS, thread_decoded_offset) -
static_cast<RunOffsetT>(1U);
DecodedOffsetT assigned_run_begin = temp_storage.runs.run_offsets[assigned_run];
// If this thread is getting assigned the last run, we make sure it will not fetch any other run after this
DecodedOffsetT assigned_run_end = (assigned_run == BLOCK_RUNS - 1)
? thread_decoded_offset + DECODED_ITEMS_PER_THREAD
: temp_storage.runs.run_offsets[assigned_run + 1];
ItemT val = temp_storage.runs.run_values[assigned_run];
#pragma unroll
for (DecodedOffsetT i = 0; i < DECODED_ITEMS_PER_THREAD; i++)
{
decoded_items[i] = val;
item_offsets[i] = thread_decoded_offset - assigned_run_begin;
if (thread_decoded_offset == assigned_run_end - 1)
{
// Make sure a thread that has been assigned the last run does not re-enter this conditional,
// by extending the last run's length to cover all of the thread's remaining items
assigned_run++;
assigned_run_begin = temp_storage.runs.run_offsets[assigned_run];
// If this thread is getting assigned the last run, we make sure it will not fetch any other run after this
assigned_run_end = (assigned_run == BLOCK_RUNS - 1) ? thread_decoded_offset + DECODED_ITEMS_PER_THREAD
: temp_storage.runs.run_offsets[assigned_run + 1];
val = temp_storage.runs.run_values[assigned_run];
}
thread_decoded_offset++;
}
}
/**
* \brief Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length decoded
* items in a blocked arrangement to \p decoded_items. If the number of run-length decoded items exceeds the
* run-length decode buffer (i.e., <b>DECODED_ITEMS_PER_THREAD * BLOCK_THREADS</b>), only the items that fit within
* the buffer are returned. Subsequent calls to <b>RunLengthDecode</b> adjusting \p from_decoded_offset can be
* used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to
* <b>RunLengthDecode</b> is not required.
*
* \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement
* \param[in] from_decoded_offset The offset within the run-length decoded output at which the returned window begins;
* invoking with a from_decoded_offset larger than total_decoded_size results in undefined behavior.
*/
HIPCUB_DEVICE __forceinline__ void RunLengthDecode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD],
DecodedOffsetT from_decoded_offset = 0)
{
DecodedOffsetT item_offsets[DECODED_ITEMS_PER_THREAD];
RunLengthDecode(decoded_items, item_offsets, from_decoded_offset);
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_RUN_LENGTH_DECODE_HPP_
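// Usage sketch (illustrative only, not part of the header above): decoding the runs from the
// documentation example -- values {3, 1, 4} with lengths {2, 1, 3} -- inside a kernel. The class
// name, template parameters, and constructor follow the public CUB BlockRunLengthDecode
// interface; adapt them to the exact signatures declared earlier in this header.
__global__ void RunLengthDecodeExampleKernel(int *d_decoded, int *d_offsets)
{
    // 64 threads, 1 run per thread, 2 decoded items per thread
    using BlockRunLengthDecodeT = cub::BlockRunLengthDecode<int, 64, 1, 2>;
    __shared__ typename BlockRunLengthDecodeT::TempStorage temp_storage;
    // Zero-length runs act as padding for threads that hold no run
    int run_values[1]  = {0};
    int run_lengths[1] = {0};
    if (threadIdx.x == 0) { run_values[0] = 3; run_lengths[0] = 2; }
    if (threadIdx.x == 1) { run_values[0] = 1; run_lengths[0] = 1; }
    if (threadIdx.x == 2) { run_values[0] = 4; run_lengths[0] = 3; }
    int total_decoded_size = 0;
    BlockRunLengthDecodeT block_rld(temp_storage, run_values, run_lengths, total_decoded_size);
    // One window of 64 * 2 decoded items suffices for the 6 items here
    int decoded_items[2];
    int item_offsets[2];
    block_rld.RunLengthDecode(decoded_items, item_offsets, 0);
    // Guarded store of the blocked arrangement: {3, 3, 1, 4, 4, 4} with offsets {0, 1, 0, 0, 1, 2}
    for (int i = 0; i < 2; ++i)
    {
        int global_idx = threadIdx.x * 2 + i;
        if (global_idx < total_decoded_size)
        {
            d_decoded[global_idx] = decoded_items[i];
            d_offsets[global_idx] = item_offsets[i];
        }
    }
}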
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_SCAN_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_SCAN_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../thread/thread_operators.cuh"
#include <cub/rocprim/block/block_scan.hpp>
BEGIN_HIPCUB_NAMESPACE
namespace detail
{
inline constexpr
typename std::underlying_type<::rocprim::block_scan_algorithm>::type
to_BlockScanAlgorithm_enum(::rocprim::block_scan_algorithm v)
{
using utype = std::underlying_type<::rocprim::block_scan_algorithm>::type;
return static_cast<utype>(v);
}
}
enum BlockScanAlgorithm
{
BLOCK_SCAN_RAKING
= detail::to_BlockScanAlgorithm_enum(::rocprim::block_scan_algorithm::reduce_then_scan),
BLOCK_SCAN_RAKING_MEMOIZE
= detail::to_BlockScanAlgorithm_enum(::rocprim::block_scan_algorithm::reduce_then_scan),
BLOCK_SCAN_WARP_SCANS
= detail::to_BlockScanAlgorithm_enum(::rocprim::block_scan_algorithm::using_warp_scan)
};
template<
typename T,
int BLOCK_DIM_X,
BlockScanAlgorithm ALGORITHM = BLOCK_SCAN_RAKING,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH /* ignored */
>
class BlockScan
: private ::rocprim::block_scan<
T,
BLOCK_DIM_X,
static_cast<::rocprim::block_scan_algorithm>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
using base_type =
typename ::rocprim::block_scan<
T,
BLOCK_DIM_X,
static_cast<::rocprim::block_scan_algorithm>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
using TempStorage = typename base_type::storage_type;
HIPCUB_DEVICE inline
BlockScan() : temp_storage_(private_storage())
{
}
HIPCUB_DEVICE inline
BlockScan(TempStorage& temp_storage) : temp_storage_(temp_storage)
{
}
HIPCUB_DEVICE inline
void InclusiveSum(T input, T& output)
{
base_type::inclusive_scan(input, output, temp_storage_);
}
HIPCUB_DEVICE inline
void InclusiveSum(T input, T& output, T& block_aggregate)
{
base_type::inclusive_scan(input, output, block_aggregate, temp_storage_);
}
template<typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void InclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::inclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, ::cub::Sum()
);
}
template<int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline
void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD])
{
base_type::inclusive_scan(input, output, temp_storage_);
}
template<int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline
void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
T& block_aggregate)
{
base_type::inclusive_scan(input, output, block_aggregate, temp_storage_);
}
template<int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::inclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, ::cub::Sum()
);
}
template<typename ScanOp>
HIPCUB_DEVICE inline
void InclusiveScan(T input, T& output, ScanOp scan_op)
{
base_type::inclusive_scan(input, output, temp_storage_, scan_op);
}
template<typename ScanOp>
HIPCUB_DEVICE inline
void InclusiveScan(T input, T& output, ScanOp scan_op, T& block_aggregate)
{
base_type::inclusive_scan(input, output, block_aggregate, temp_storage_, scan_op);
}
template<typename ScanOp, typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void InclusiveScan(T input, T& output, ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::inclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, scan_op
);
}
template<int ITEMS_PER_THREAD, typename ScanOp>
HIPCUB_DEVICE inline
void InclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], ScanOp scan_op)
{
base_type::inclusive_scan(input, output, temp_storage_, scan_op);
}
template<int ITEMS_PER_THREAD, typename ScanOp>
HIPCUB_DEVICE inline
void InclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
ScanOp scan_op, T& block_aggregate)
{
base_type::inclusive_scan(input, output, block_aggregate, temp_storage_, scan_op);
}
template<int ITEMS_PER_THREAD, typename ScanOp, typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void InclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::inclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, scan_op
);
}
HIPCUB_DEVICE inline
void ExclusiveSum(T input, T& output)
{
base_type::exclusive_scan(input, output, T(0), temp_storage_);
}
HIPCUB_DEVICE inline
void ExclusiveSum(T input, T& output, T& block_aggregate)
{
base_type::exclusive_scan(input, output, T(0), block_aggregate, temp_storage_);
}
template<typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void ExclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::exclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, ::cub::Sum()
);
}
template<int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline
void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD])
{
base_type::exclusive_scan(input, output, T(0), temp_storage_);
}
template<int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline
void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
T& block_aggregate)
{
base_type::exclusive_scan(input, output, T(0), block_aggregate, temp_storage_);
}
template<int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::exclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, ::cub::Sum()
);
}
template<typename ScanOp>
HIPCUB_DEVICE inline
void ExclusiveScan(T input, T& output, T initial_value, ScanOp scan_op)
{
base_type::exclusive_scan(input, output, initial_value, temp_storage_, scan_op);
}
template<typename ScanOp>
HIPCUB_DEVICE inline
void ExclusiveScan(T input, T& output, T initial_value,
ScanOp scan_op, T& block_aggregate)
{
base_type::exclusive_scan(
input, output, initial_value, block_aggregate, temp_storage_, scan_op
);
}
template<typename ScanOp, typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void ExclusiveScan(T input, T& output, ScanOp scan_op,
BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::exclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, scan_op
);
}
template<int ITEMS_PER_THREAD, typename ScanOp>
HIPCUB_DEVICE inline
void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
T initial_value, ScanOp scan_op)
{
base_type::exclusive_scan(input, output, initial_value, temp_storage_, scan_op);
}
template<int ITEMS_PER_THREAD, typename ScanOp>
HIPCUB_DEVICE inline
void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
T initial_value, ScanOp scan_op, T& block_aggregate)
{
base_type::exclusive_scan(
input, output, initial_value, block_aggregate, temp_storage_, scan_op
);
}
template<int ITEMS_PER_THREAD, typename ScanOp, typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::exclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, scan_op
);
}
private:
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_SCAN_HPP_
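// Usage sketch (illustrative only, not part of the header above): a block-wide exclusive prefix
// sum over four items per thread, exercising the ExclusiveSum array overload defined above. The
// kernel name and block size are placeholders.
__global__ void BlockScanExampleKernel(int *d_data)
{
    using BlockScanT = cub::BlockScan<int, 128, cub::BLOCK_SCAN_WARP_SCANS>;
    __shared__ typename BlockScanT::TempStorage temp_storage;
    int thread_data[4];
    // Load a blocked arrangement: thread t owns elements [4*t, 4*t + 3]
    for (int i = 0; i < 4; ++i)
    {
        thread_data[i] = d_data[threadIdx.x * 4 + i];
    }
    // In-place exclusive scan; overloads returning a block aggregate are also available
    BlockScanT(temp_storage).ExclusiveSum(thread_data, thread_data);
    for (int i = 0; i < 4; ++i)
    {
        d_data[threadIdx.x * 4 + i] = thread_data[i];
    }
}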
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../thread/thread_operators.cuh"
#include <cub/rocprim/block/block_shuffle.hpp>
BEGIN_HIPCUB_NAMESPACE
template <
typename T,
int BLOCK_DIM_X,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH>
class BlockShuffle : public ::rocprim::block_shuffle<
T,
BLOCK_DIM_X,
BLOCK_DIM_Y,
BLOCK_DIM_Z>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
using base_type =
typename ::rocprim::block_shuffle<
T,
BLOCK_DIM_X,
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
using TempStorage = typename base_type::storage_type;
HIPCUB_DEVICE inline
BlockShuffle() : temp_storage_(private_storage())
{}
HIPCUB_DEVICE inline
BlockShuffle(TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
: temp_storage_(temp_storage)
{}
/**
* \brief Each <em>thread<sub>i</sub></em> obtains the \p input provided by <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub>. The offset \p distance may be negative.
*
* \par
* - \smemreuse
*/
HIPCUB_DEVICE inline void Offset(
T input, ///< [in] The input item from the calling thread (<em>thread<sub>i</sub></em>)
        T&  output,             ///< [out] The \p input item from the successor (or predecessor) thread <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub> (may be aliased to \p input). This value is only updated for <em>thread<sub>i</sub></em> when 0 <= (<em>i</em> + \p distance) < <tt>BLOCK_THREADS-1</tt>
int distance = 1) ///< [in] Offset distance (may be negative)
{
base_type::offset(input,output,distance);
}
/**
 * \brief Each <em>thread<sub>i</sub></em> obtains the \p input provided by <em>thread</em><sub>(<em>i</em>+<tt>distance</tt>)%<tt>BLOCK_THREADS</tt></sub>.
*
* \par
* - \smemreuse
*/
HIPCUB_DEVICE inline void Rotate(
T input, ///< [in] The calling thread's input item
        T&  output,             ///< [out] The \p input item from thread <em>thread</em><sub>(<em>i</em>+<tt>distance</tt>)%<tt>BLOCK_THREADS</tt></sub> (may be aliased to \p input). This value is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>
unsigned int distance = 1) ///< [in] Offset distance (0 < \p distance < <tt>BLOCK_THREADS</tt>)
{
base_type::rotate(input,output,distance);
}
/**
* \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it up by one item
*
* \par
* - \blocked
* - \granularity
* - \smemreuse
*/
template <int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline void Up(
T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items
T (&prev)[ITEMS_PER_THREAD]) ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
{
base_type::up(input,prev);
}
/**
* \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it up by one item. All threads receive the \p input provided by <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub>.
*
* \par
* - \blocked
* - \granularity
* - \smemreuse
*/
template <int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline void Up(
T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items
T (&prev)[ITEMS_PER_THREAD], ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
T &block_suffix) ///< [out] The item \p input[ITEMS_PER_THREAD-1] from <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub>, provided to all threads
{
base_type::up(input,prev,block_suffix);
}
/**
* \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it down by one item
*
* \par
* - \blocked
* - \granularity
* - \smemreuse
*/
template <int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline void Down(
T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items
        T (&next)[ITEMS_PER_THREAD])    ///< [out] The corresponding successor items (may be aliased to \p input). The value \p next[ITEMS_PER_THREAD-1] is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>.
{
base_type::down(input,next);
}
/**
* \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of input items, shifting it down by one item. All threads receive \p input[0] provided by <em>thread</em><sub><tt>0</tt></sub>.
*
* \par
* - \blocked
* - \granularity
* - \smemreuse
*/
template <int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline void Down(
T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items
        T (&next)[ITEMS_PER_THREAD],    ///< [out] The corresponding successor items (may be aliased to \p input). The value \p next[ITEMS_PER_THREAD-1] is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>.
T &block_prefix) ///< [out] The item \p input[0] from <em>thread</em><sub><tt>0</tt></sub>, provided to all threads
{
base_type::down(input,next,block_prefix);
}
private:
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
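// Usage sketch (illustrative only, not part of the header above): each thread fetches its right
// neighbour's item via Offset. The kernel name and block size are placeholders.
__global__ void BlockShuffleExampleKernel(int *d_data)
{
    using BlockShuffleT = cub::BlockShuffle<int, 128>;
    __shared__ typename BlockShuffleT::TempStorage temp_storage;
    int item = d_data[threadIdx.x];
    // Threads whose source index falls outside the block keep their own value,
    // since output is only updated when the source thread exists
    int neighbour = item;
    BlockShuffleT(temp_storage).Offset(item, neighbour, 1);
    d_data[threadIdx.x] = neighbour;
}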
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_STORE_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_STORE_HPP_
#include <type_traits>
#include "../config.hpp"
#include "block_store_func.hpp"
#include <cub/rocprim/block/block_store.hpp>
BEGIN_HIPCUB_NAMESPACE
namespace detail
{
inline constexpr
typename std::underlying_type<::rocprim::block_store_method>::type
to_BlockStoreAlgorithm_enum(::rocprim::block_store_method v)
{
using utype = std::underlying_type<::rocprim::block_store_method>::type;
return static_cast<utype>(v);
}
}
enum BlockStoreAlgorithm
{
BLOCK_STORE_DIRECT
= detail::to_BlockStoreAlgorithm_enum(::rocprim::block_store_method::block_store_direct),
BLOCK_STORE_STRIPED
= detail::to_BlockStoreAlgorithm_enum(::rocprim::block_store_method::block_store_striped),
BLOCK_STORE_VECTORIZE
= detail::to_BlockStoreAlgorithm_enum(::rocprim::block_store_method::block_store_vectorize),
BLOCK_STORE_TRANSPOSE
= detail::to_BlockStoreAlgorithm_enum(::rocprim::block_store_method::block_store_transpose),
BLOCK_STORE_WARP_TRANSPOSE
= detail::to_BlockStoreAlgorithm_enum(::rocprim::block_store_method::block_store_warp_transpose),
BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED
= detail::to_BlockStoreAlgorithm_enum(::rocprim::block_store_method::block_store_warp_transpose)
};
template<
typename T,
int BLOCK_DIM_X,
int ITEMS_PER_THREAD,
BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH /* ignored */
>
class BlockStore
: private ::rocprim::block_store<
T,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
static_cast<::rocprim::block_store_method>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
using base_type =
typename ::rocprim::block_store<
T,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
static_cast<::rocprim::block_store_method>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
using TempStorage = typename base_type::storage_type;
HIPCUB_DEVICE inline
BlockStore() : temp_storage_(private_storage())
{
}
HIPCUB_DEVICE inline
BlockStore(TempStorage& temp_storage) : temp_storage_(temp_storage)
{
}
template<class OutputIteratorT>
HIPCUB_DEVICE inline
void Store(OutputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD])
{
base_type::store(block_iter, items, temp_storage_);
}
template<class OutputIteratorT>
HIPCUB_DEVICE inline
void Store(OutputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items)
{
base_type::store(block_iter, items, valid_items, temp_storage_);
}
private:
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_STORE_HPP_
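// Usage sketch (illustrative only, not part of the header above): storing a blocked arrangement
// through shared memory with BLOCK_STORE_TRANSPOSE, using the guarded overload so a partial final
// tile is handled. Names and sizes are placeholders.
__global__ void BlockStoreExampleKernel(int *d_out, int valid_items)
{
    using BlockStoreT = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_TRANSPOSE>;
    __shared__ typename BlockStoreT::TempStorage temp_storage;
    int items[4];
    for (int i = 0; i < 4; ++i)
    {
        items[i] = threadIdx.x * 4 + i;  // arbitrary per-thread data
    }
    // Only the first valid_items elements of the tile are written
    BlockStoreT(temp_storage).Store(d_out, items, valid_items);
}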
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_STORE_FUNC_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_STORE_FUNC_HPP_
#include "../config.hpp"
#include <cub/rocprim/block/block_store_func.hpp>
BEGIN_HIPCUB_NAMESPACE
template<
typename T,
int ITEMS_PER_THREAD,
typename OutputIteratorT
>
HIPCUB_DEVICE inline
void StoreDirectBlocked(int linear_id,
OutputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD])
{
::rocprim::block_store_direct_blocked(
linear_id, block_iter, items
);
}
template<
typename T,
int ITEMS_PER_THREAD,
typename OutputIteratorT
>
HIPCUB_DEVICE inline
void StoreDirectBlocked(int linear_id,
OutputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items)
{
::rocprim::block_store_direct_blocked(
linear_id, block_iter, items, valid_items
);
}
template <
typename T,
int ITEMS_PER_THREAD
>
HIPCUB_DEVICE inline
void StoreDirectBlockedVectorized(int linear_id,
T* block_iter,
T (&items)[ITEMS_PER_THREAD])
{
::rocprim::block_store_direct_blocked_vectorized(
linear_id, block_iter, items
);
}
template<
int BLOCK_THREADS,
typename T,
int ITEMS_PER_THREAD,
typename OutputIteratorT
>
HIPCUB_DEVICE inline
void StoreDirectStriped(int linear_id,
OutputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD])
{
::rocprim::block_store_direct_striped<BLOCK_THREADS>(
linear_id, block_iter, items
);
}
template<
int BLOCK_THREADS,
typename T,
int ITEMS_PER_THREAD,
typename OutputIteratorT
>
HIPCUB_DEVICE inline
void StoreDirectStriped(int linear_id,
OutputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items)
{
::rocprim::block_store_direct_striped<BLOCK_THREADS>(
linear_id, block_iter, items, valid_items
);
}
template<
typename T,
int ITEMS_PER_THREAD,
typename OutputIteratorT
>
HIPCUB_DEVICE inline
void StoreDirectWarpStriped(int linear_id,
OutputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD])
{
::rocprim::block_store_direct_warp_striped(
linear_id, block_iter, items
);
}
template<
typename T,
int ITEMS_PER_THREAD,
typename OutputIteratorT
>
HIPCUB_DEVICE inline
void StoreDirectWarpStriped(int linear_id,
OutputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items)
{
::rocprim::block_store_direct_warp_striped(
linear_id, block_iter, items, valid_items
);
}
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_STORE_FUNC_HPP_
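// Usage sketch (illustrative only, not part of the header above): with StoreDirectStriped<128>,
// thread t writes items[i] to d_out[i * 128 + t], so consecutive threads touch consecutive
// addresses for each i and stores coalesce. Values are arbitrary.
__global__ void StoreStripedExampleKernel(int *d_out)
{
    int items[4];
    for (int i = 0; i < 4; ++i)
    {
        items[i] = threadIdx.x + i * 128;  // arbitrary per-thread data
    }
    cub::StoreDirectStriped<128>(threadIdx.x, d_out, items);
}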
/******************************************************************************
* Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* radix_rank_sort_operations.cuh contains common abstractions, definitions and
* operations used for radix sorting and ranking.
*/
#ifndef HIPCUB_ROCPRIM_BLOCK_RADIX_RANK_SORT_OPERATIONS_HPP_
#define HIPCUB_ROCPRIM_BLOCK_RADIX_RANK_SORT_OPERATIONS_HPP_
#include <type_traits>
#include "../config.hpp"
#include <cub/rocprim/config.hpp>
#include <cub/rocprim/type_traits.hpp>
#include <cub/rocprim/detail/various.hpp>
BEGIN_HIPCUB_NAMESPACE
/** \brief Twiddling keys for radix sort. */
template <bool IS_DESCENDING, typename KeyT>
struct RadixSortTwiddle
{
typedef Traits<KeyT> TraitsT;
typedef typename TraitsT::UnsignedBits UnsignedBits;
static HIPCUB_HOST_DEVICE __forceinline__ UnsignedBits In(UnsignedBits key)
{
key = TraitsT::TwiddleIn(key);
if (IS_DESCENDING) key = ~key;
return key;
}
static HIPCUB_HOST_DEVICE __forceinline__ UnsignedBits Out(UnsignedBits key)
{
if (IS_DESCENDING) key = ~key;
key = TraitsT::TwiddleOut(key);
return key;
}
static HIPCUB_HOST_DEVICE __forceinline__ UnsignedBits DefaultKey()
{
return Out(~UnsignedBits(0));
}
};
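// Usage sketch (illustrative only): In and Out are inverses, so a key round-trips through the
// descending-sort twiddle unchanged. For unsigned integer keys TwiddleIn is the identity, so In
// reduces to a bitwise complement.
__device__ __forceinline__ unsigned int TwiddleRoundTrip(unsigned int key_bits)
{
    using TwiddleT = cub::RadixSortTwiddle<true /* IS_DESCENDING */, unsigned int>;
    unsigned int twiddled = TwiddleT::In(key_bits);  // twiddle, then complement
    return TwiddleT::Out(twiddled);                  // complement, then untwiddle == key_bits
}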
/** \brief Base struct for digit extractor. Contains common code to provide
special handling for floating-point -0.0.
\note This handles correctly both the case when the keys are
bitwise-complemented after twiddling for descending sort (in onesweep) as
well as when the keys are not bit-negated, but the implementation handles
descending sort separately (in other implementations in CUB). Twiddling
alone maps -0.0f to 0x7fffffff and +0.0f to 0x80000000 for float, which are
subsequent bit patterns and bitwise complements of each other. For onesweep,
both -0.0f and +0.0f are mapped to the bit pattern of +0.0f (0x80000000) for
ascending sort, and to the pattern of -0.0f (0x7fffffff) for descending
sort. For all other sorting implementations in CUB, both are always mapped
to +0.0f. Since bit patterns for both -0.0f and +0.0f are next to each other
and only one of them is used, the sorting works correctly. For double, the
same applies, but with 64-bit patterns.
*/
template <typename KeyT>
struct BaseDigitExtractor
{
typedef Traits<KeyT> TraitsT;
typedef typename TraitsT::UnsignedBits UnsignedBits;
enum
{
FLOAT_KEY = TraitsT::CATEGORY == FLOATING_POINT,
};
static __device__ __forceinline__ UnsignedBits ProcessFloatMinusZero(UnsignedBits key)
{
if (!FLOAT_KEY) {
return key;
} else {
UnsignedBits TWIDDLED_MINUS_ZERO_BITS =
TraitsT::TwiddleIn(UnsignedBits(1) << UnsignedBits(8 * sizeof(UnsignedBits) - 1));
UnsignedBits TWIDDLED_ZERO_BITS = TraitsT::TwiddleIn(0);
return key == TWIDDLED_MINUS_ZERO_BITS ? TWIDDLED_ZERO_BITS : key;
}
}
};
/** \brief A wrapper type to extract digits. Uses the BFE intrinsic to extract a
* key from a digit. */
template <typename KeyT>
struct BFEDigitExtractor : BaseDigitExtractor<KeyT>
{
using typename BaseDigitExtractor<KeyT>::UnsignedBits;
uint32_t bit_start, num_bits;
explicit __device__ __forceinline__ BFEDigitExtractor(
uint32_t bit_start = 0, uint32_t num_bits = 0)
: bit_start(bit_start), num_bits(num_bits)
{ }
__device__ __forceinline__ uint32_t Digit(UnsignedBits key)
{
return BFE(this->ProcessFloatMinusZero(key), bit_start, num_bits);
}
};
/** \brief A wrapper type to extract digits. Uses a combination of shift and
* bitwise and to extract digits. */
template <typename KeyT>
struct ShiftDigitExtractor : BaseDigitExtractor<KeyT>
{
using typename BaseDigitExtractor<KeyT>::UnsignedBits;
uint32_t bit_start, mask;
explicit __device__ __forceinline__ ShiftDigitExtractor(
uint32_t bit_start = 0, uint32_t num_bits = 0)
: bit_start(bit_start), mask((1 << num_bits) - 1)
{ }
__device__ __forceinline__ uint32_t Digit(UnsignedBits key)
{
return uint32_t(this->ProcessFloatMinusZero(key) >> UnsignedBits(bit_start)) & mask;
}
};
END_HIPCUB_NAMESPACE
#endif //HIPCUB_ROCPRIM_BLOCK_RADIX_RANK_SORT_OPERATIONS_HPP_
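// Usage sketch (illustrative only, not part of the header above): extracting a radix digit from a
// float key the way the ranking code would. The key is first mapped to an order-preserving bit
// pattern with TwiddleIn; the extractor then isolates num_bits bits starting at bit_start. The
// values bit_start = 8 and num_bits = 4 are arbitrary.
__device__ __forceinline__ uint32_t ExampleFloatDigit(float key)
{
    using UnsignedBits = typename cub::Traits<float>::UnsignedBits;
    UnsignedBits bits = reinterpret_cast<UnsignedBits&>(key);
    bits = cub::Traits<float>::TwiddleIn(bits);
    cub::ShiftDigitExtractor<float> extractor(8, 4);
    // ProcessFloatMinusZero inside Digit makes -0.0f and +0.0f extract identically
    return extractor.Digit(bits);
}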
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_CONFIG_HPP_
#define HIPCUB_CONFIG_HPP_
#include <cuda_runtime.h>
#define HIPCUB_NAMESPACE cub
#define BEGIN_HIPCUB_NAMESPACE \
namespace cub {
#define END_HIPCUB_NAMESPACE \
    } /* cub */
#ifndef HIPCUB_ARCH
#define HIPCUB_ARCH 1
#endif
#define CUB_DEVICE_WARP_THREADS 64
#ifdef __CUDACC__
#define HIPCUB_ROCPRIM_API 1
#define HIPCUB_RUNTIME_FUNCTION __host__
#elif defined(__HIP_PLATFORM_NVIDIA__)
#define HIPCUB_CUB_API 1
#define HIPCUB_RUNTIME_FUNCTION CUB_RUNTIME_FUNCTION
#include <cub/util_arch.cuh>
#define HIPCUB_WARP_THREADS CUB_PTX_WARP_THREADS
#define HIPCUB_DEVICE_WARP_THREADS CUB_PTX_WARP_THREADS
#define HIPCUB_HOST_WARP_THREADS CUB_PTX_WARP_THREADS
#define HIPCUB_ARCH CUB_PTX_ARCH
BEGIN_HIPCUB_NAMESPACE
using namespace cub;
END_HIPCUB_NAMESPACE
#endif
/// Supported warp sizes
#define HIPCUB_WARP_SIZE_32 32u
#define HIPCUB_WARP_SIZE_64 64u
#define HIPCUB_MAX_WARP_SIZE HIPCUB_WARP_SIZE_64
#define HIPCUB_HOST __host__
#define HIPCUB_DEVICE __device__
#define HIPCUB_HOST_DEVICE __host__ __device__
#define HIPCUB_SHARED_MEMORY __shared__
// Helper macros to disable warnings in clang
#ifdef __clang__
#define HIPCUB_PRAGMA_TO_STR(x) _Pragma(#x)
#define HIPCUB_CLANG_SUPPRESS_WARNING_PUSH _Pragma("clang diagnostic push")
#define HIPCUB_CLANG_SUPPRESS_WARNING(w) HIPCUB_PRAGMA_TO_STR(clang diagnostic ignored w)
#define HIPCUB_CLANG_SUPPRESS_WARNING_POP _Pragma("clang diagnostic pop")
#define HIPCUB_CLANG_SUPPRESS_WARNING_WITH_PUSH(w) \
HIPCUB_CLANG_SUPPRESS_WARNING_PUSH HIPCUB_CLANG_SUPPRESS_WARNING(w)
#else // __clang__
#define HIPCUB_CLANG_SUPPRESS_WARNING_PUSH
#define HIPCUB_CLANG_SUPPRESS_WARNING(w)
#define HIPCUB_CLANG_SUPPRESS_WARNING_POP
#define HIPCUB_CLANG_SUPPRESS_WARNING_WITH_PUSH(w)
#endif // __clang__
BEGIN_HIPCUB_NAMESPACE
/// hipCUB error reporting macro (prints error messages to stderr)
#if (defined(DEBUG) || defined(_DEBUG)) && !defined(HIPCUB_STDERR)
#define HIPCUB_STDERR
#endif
inline
cudaError_t Debug(
cudaError_t error,
const char* filename,
int line)
{
(void)filename;
(void)line;
#ifdef HIPCUB_STDERR
if (error)
{
fprintf(stderr, "cuda error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error));
fflush(stderr);
}
#endif
return error;
}
#ifndef cubDebug
#define cubDebug(e) cub::Debug((cudaError_t) (e), __FILE__, __LINE__)
#endif
END_HIPCUB_NAMESPACE
#endif // HIPCUB_CONFIG_HPP_
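// Usage sketch (illustrative only, not part of the header above): wrapping runtime calls in
// cubDebug so failures are reported (to stderr when HIPCUB_STDERR is defined) while the error
// code still propagates to the caller. The allocation size is arbitrary.
inline cudaError_t AllocateAndClear(void **d_ptr, size_t bytes = 1024)
{
    cudaError_t error = cubDebug(cudaMalloc(d_ptr, bytes));
    if (error != cudaSuccess)
    {
        return error;
    }
    return cubDebug(cudaMemset(*d_ptr, 0, bytes));
}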
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_HIPCUB_HPP_
#define HIPCUB_ROCPRIM_HIPCUB_HPP_
#include "config.hpp"
#include "version.cuh"
#include "util_allocator.cuh"
#include "util_type.cuh"
#include "util_ptx.cuh"
#include "thread/thread_operators.cuh"
// Iterator
#include "iterator/arg_index_input_iterator.cuh"
#include "iterator/cache_modified_input_iterator.cuh"
#include "iterator/cache_modified_output_iterator.cuh"
#include "iterator/constant_input_iterator.cuh"
#include "iterator/counting_input_iterator.cuh"
#include "iterator/discard_output_iterator.cuh"
#include "iterator/tex_obj_input_iterator.cuh"
#include "iterator/tex_ref_input_iterator.cuh"
#include "iterator/transform_input_iterator.cuh"
// Warp
#include "warp/warp_exchange.hpp"
#include "warp/warp_load.hpp"
#include "warp/warp_merge_sort.hpp"
#include "warp/warp_reduce.cuh"
#include "warp/warp_scan.cuh"
#include "warp/warp_store.hpp"
// Thread
#include "thread/thread_load.cuh"
#include "thread/thread_operators.cuh"
#include "thread/thread_reduce.cuh"
#include "thread/thread_scan.cuh"
#include "thread/thread_search.cuh"
#include "thread/thread_sort.hpp"
#include "thread/thread_store.cuh"
// Block
#include "block/block_discontinuity.cuh"
#include "block/block_exchange.cuh"
#include "block/block_histogram.cuh"
#include "block/block_load.cuh"
#include "block/block_radix_sort.cuh"
#include "block/block_reduce.cuh"
#include "block/block_scan.cuh"
#include "block/block_store.cuh"
// Device
#include "device/device_adjacent_difference.hpp"
#include "device/device_histogram.cuh"
#include "device/device_radix_sort.cuh"
#include "device/device_reduce.cuh"
#include "device/device_run_length_encode.cuh"
#include "device/device_scan.cuh"
#include "device/device_segmented_radix_sort.cuh"
#include "device/device_segmented_reduce.cuh"
#include "device/device_segmented_sort.hpp"
#include "device/device_select.cuh"
#include "device/device_partition.cuh"
#endif // HIPCUB_ROCPRIM_HIPCUB_HPP_