// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef ROCPRIM_INTRINSICS_WARP_SHUFFLE_HPP_ #define ROCPRIM_INTRINSICS_WARP_SHUFFLE_HPP_ #include #include "../config.hpp" #include "thread.hpp" /// \addtogroup warpmodule /// @{ BEGIN_ROCPRIM_NAMESPACE namespace detail { #ifdef __HIP_CPU_RT__ // TODO: consider adding macro checks relaying to std::bit_cast when compiled // using C++20. template typename std::enable_if_t< sizeof(To) == sizeof(From) && std::is_trivially_copyable_v && std::is_trivially_copyable_v, To> // constexpr support needs compiler magic bit_cast(const From& src) noexcept { To dst; std::memcpy(&dst, &src, sizeof(To)); return dst; } #endif template ROCPRIM_DEVICE ROCPRIM_INLINE typename std::enable_if::value && (sizeof(T) % sizeof(int) == 0), T>::type warp_shuffle_op(const T& input, ShuffleOp&& op) { constexpr int words_no = (sizeof(T) + sizeof(int) - 1) / sizeof(int); struct V { int words[words_no]; }; #ifdef __HIP_CPU_RT__ V a = bit_cast(input); #else V a = __builtin_bit_cast(V, input); #endif ROCPRIM_UNROLL for(int i = 0; i < words_no; i++) { a.words[i] = op(a.words[i]); } #ifdef __HIP_CPU_RT__ return bit_cast(a); #else return __builtin_bit_cast(T, a); #endif } template ROCPRIM_DEVICE ROCPRIM_INLINE typename std::enable_if::value && (sizeof(T) % sizeof(int) == 0)), T>::type warp_shuffle_op(const T& input, ShuffleOp&& op) { constexpr int words_no = (sizeof(T) + sizeof(int) - 1) / sizeof(int); T output; ROCPRIM_UNROLL for(int i = 0; i < words_no; i++) { const size_t s = std::min(sizeof(int), sizeof(T) - i * sizeof(int)); int word; #ifdef __HIP_CPU_RT__ std::memcpy(&word, reinterpret_cast(&input) + i * sizeof(int), s); #else __builtin_memcpy(&word, reinterpret_cast(&input) + i * sizeof(int), s); #endif word = op(word); #ifdef __HIP_CPU_RT__ std::memcpy(reinterpret_cast(&output) + i * sizeof(int), &word, s); #else __builtin_memcpy(reinterpret_cast(&output) + i * sizeof(int), &word, s); #endif } return output; } template ROCPRIM_DEVICE ROCPRIM_INLINE T warp_move_dpp(const T& input) { return detail::warp_shuffle_op( input, [=](int v) -> int { // TODO: clean-up, this function activates based ROCPRIM_DETAIL_USE_DPP, however inclusion and // parsing of the template happens unconditionally. The condition causing compilation to // fail is ordinary host-compilers looking at the headers. Non-hipcc compilers don't define // __builtin_amdgcn_update_dpp, hence fail to parse the template altogether. (Except MSVC // because even using /permissive- they somehow still do delayed parsing of the body of // function templates, even though they pinky-swear they don't.) #if !defined(__HIP_CPU_RT__) return ::__builtin_amdgcn_mov_dpp(v, dpp_ctrl, row_mask, bank_mask, bound_ctrl); #else return v; #endif } ); } /// \brief Swizzle for any data type. /// /// Each thread in warp obtains \p input from src_lane-th thread /// in warp, where src_lane is current lane with a mask applied. /// /// \param input - input to pass to other threads template ROCPRIM_DEVICE ROCPRIM_INLINE T warp_swizzle(const T& input) { return detail::warp_shuffle_op( input, [=](int v) -> int { return ::__builtin_amdgcn_ds_swizzle(v, mask); } ); } } // end namespace detail /// \brief Shuffle for any data type. /// /// Each thread in warp obtains \p input from src_lane-th thread /// in warp. If \p width is less than device_warp_size() then each subsection of the /// warp behaves as a separate entity with a starting logical lane id of 0. /// If \p src_lane is not in [0; \p width) range, the returned value is /// equal to \p input passed by the src_lane modulo width thread. /// /// Note: The optional \p width parameter must be a power of 2; results are /// undefined if it is not a power of 2, or it is greater than device_warp_size(). /// /// \param input - input to pass to other threads /// \param src_lane - warp if of a thread whose \p input should be returned /// \param width - logical warp width template ROCPRIM_DEVICE ROCPRIM_INLINE T warp_shuffle(const T& input, const int src_lane, const int width = device_warp_size()) { return detail::warp_shuffle_op( input, [=](int v) -> int { return __shfl(v, src_lane, width); } ); } /// \brief Shuffle up for any data type. /// /// i-th thread in warp obtains \p input from i-delta-th /// thread in warp. If \p i-delta is not in [0; \p width) range, /// thread's own \p input is returned. /// /// Note: The optional \p width parameter must be a power of 2; results are /// undefined if it is not a power of 2, or it is greater than device_warp_size(). /// /// \param input - input to pass to other threads /// \param delta - offset for calculating source lane id /// \param width - logical warp width template ROCPRIM_DEVICE ROCPRIM_INLINE T warp_shuffle_up(const T& input, const unsigned int delta, const int width = device_warp_size()) { return detail::warp_shuffle_op( input, [=](int v) -> int { return __shfl_up(v, delta, width); } ); } /// \brief Shuffle down for any data type. /// /// i-th thread in warp obtains \p input from i+delta-th /// thread in warp. If \p i+delta is not in [0; \p width) range, /// thread's own \p input is returned. /// /// Note: The optional \p width parameter must be a power of 2; results are /// undefined if it is not a power of 2, or it is greater than device_warp_size(). /// /// \param input - input to pass to other threads /// \param delta - offset for calculating source lane id /// \param width - logical warp width template ROCPRIM_DEVICE ROCPRIM_INLINE T warp_shuffle_down(const T& input, const unsigned int delta, const int width = device_warp_size()) { return detail::warp_shuffle_op( input, [=](int v) -> int { return __shfl_down(v, delta, width); } ); } /// \brief Shuffle XOR for any data type. /// /// i-th thread in warp obtains \p input from i^lane_mask-th /// thread in warp. /// /// Note: The optional \p width parameter must be a power of 2; results are /// undefined if it is not a power of 2, or it is greater than device_warp_size(). /// /// \param input - input to pass to other threads /// \param lane_mask - mask used for calculating source lane id /// \param width - logical warp width template ROCPRIM_DEVICE ROCPRIM_INLINE T warp_shuffle_xor(const T& input, const int lane_mask, const int width = device_warp_size()) { return detail::warp_shuffle_op( input, [=](int v) -> int { return __shfl_xor(v, lane_mask, width); } ); } END_ROCPRIM_NAMESPACE #endif // ROCPRIM_INTRINSICS_WARP_SHUFFLE_HPP_ /// @} // end of group warpmodule