// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef ROCPRIM_WARP_WARP_EXCHANGE_HPP_ #define ROCPRIM_WARP_WARP_EXCHANGE_HPP_ #include "../config.hpp" #include "../detail/various.hpp" #include "../intrinsics.hpp" #include "../intrinsics/warp_shuffle.hpp" #include "../functional.hpp" #include "../types.hpp" /// \addtogroup warpmodule /// @{ BEGIN_ROCPRIM_NAMESPACE /// \brief The \p warp_exchange class is a warp level parallel primitive which provides /// methods for rearranging items partitioned across threads in a warp. /// /// \tparam T - the input type. /// \tparam ItemsPerThread - the number of items contributed by each thread. /// \tparam WarpSize - the number of threads in a warp. /// /// \par Overview /// * The \p warp_exchange class supports the following rearrangement methods: /// * Transposing a blocked arrangement to a striped arrangement. /// * Transposing a striped arrangement to a blocked arrangement. /// /// \par Examples /// \parblock /// In the example an exchange operation is performed on a warp of 8 threads, using type /// \p int with 4 items per thread. /// /// \code{.cpp} /// __global__ void example_kernel(...) /// { /// constexpr unsigned int threads_per_block = 128; /// constexpr unsigned int threads_per_warp = 8; /// constexpr unsigned int items_per_thread = 4; /// constexpr unsigned int warps_per_block = threads_per_block / threads_per_warp; /// const unsigned int warp_id = hipThreadIdx_x / threads_per_warp; /// // specialize warp_exchange for int, warp of 8 threads and 4 items per thread /// using warp_exchange_int = rocprim::warp_exchange; /// // allocate storage in shared memory /// __shared__ warp_exchange_int::storage_type storage[warps_per_block]; /// /// int items[items_per_thread]; /// ... /// warp_exchange_int w_exchange; /// w_exchange.blocked_to_striped(items, items, storage[warp_id]); /// ... /// } /// \endcode /// \endparblock template< class T, unsigned int ItemsPerThread, unsigned int WarpSize = ::rocprim::device_warp_size() > class warp_exchange { static_assert(::rocprim::detail::is_power_of_two(WarpSize), "Logical warp size must be a power of two."); static_assert(WarpSize <= ::rocprim::device_warp_size(), "Logical warp size cannot be larger than physical warp size."); // Struct used for creating a raw_storage object for this primitive's temporary storage. struct storage_type_ { T buffer[WarpSize * ItemsPerThread]; }; public: /// \brief Struct used to allocate a temporary memory that is required for thread /// communication during operations provided by the related parallel primitive. /// /// Depending on the implemention the operations exposed by parallel primitive may /// require a temporary storage for thread communication. The storage should be allocated /// using keywords __shared__. It can be aliased to /// an externally allocated memory, or be a part of a union type with other storage types /// to increase shared memory reusability. #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen using storage_type = detail::raw_storage; #else using storage_type = storage_type_; // only for Doxygen #endif /// \brief Transposes a blocked arrangement of items to a striped arrangement /// across the warp, using temporary storage. /// /// \tparam U - [inferred] the output type. /// /// \param [in] input - array that data is loaded from. /// \param [out] output - array that data is loaded to. /// \param [in] storage - reference to a temporary storage object of type storage_type. /// /// \par Storage reusage /// Synchronization barrier should be placed before \p storage is reused /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads(). /// /// \par Example. /// \code{.cpp} /// __global__ void example_kernel(...) /// { /// constexpr unsigned int threads_per_block = 128; /// constexpr unsigned int threads_per_warp = 8; /// constexpr unsigned int items_per_thread = 4; /// constexpr unsigned int warps_per_block = threads_per_block / threads_per_warp; /// const unsigned int warp_id = hipThreadIdx_x / threads_per_warp; /// // specialize warp_exchange for int, warp of 8 threads and 4 items per thread /// using warp_exchange_int = rocprim::warp_exchange; /// // allocate storage in shared memory /// __shared__ warp_exchange_int::storage_type storage[warps_per_block]; /// /// int items[items_per_thread]; /// ... /// warp_exchange_int w_exchange; /// w_exchange.blocked_to_striped(items, items, storage[warp_id]); /// ... /// } /// \endcode template ROCPRIM_DEVICE ROCPRIM_INLINE void blocked_to_striped(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], storage_type& storage) { const unsigned int flat_id = ::rocprim::detail::logical_lane_id(); storage_type_& storage_ = storage.get(); ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { storage_.buffer[flat_id * ItemsPerThread + i] = input[i]; } ::rocprim::wave_barrier(); ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { output[i] = storage_.buffer[i * WarpSize + flat_id]; } } /// \brief Transposes a blocked arrangement of items to a striped arrangement /// across the warp, using warp shuffle operations. /// Caution: this API is experimental. Performance might not be consistent. /// ItemsPerThread must be a divisor of WarpSize. /// /// \tparam U - [inferred] the output type. /// /// \param [in] input - array that data is loaded from. /// \param [out] output - array that data is loaded to. /// /// \par Example. /// \code{.cpp} /// __global__ void example_kernel(...) /// { /// constexpr unsigned int threads_per_block = 128; /// constexpr unsigned int threads_per_warp = 8; /// constexpr unsigned int items_per_thread = 4; /// constexpr unsigned int warps_per_block = threads_per_block / threads_per_warp; /// const unsigned int warp_id = hipThreadIdx_x / threads_per_warp; /// // specialize warp_exchange for int, warp of 8 threads and 4 items per thread /// using warp_exchange_int = rocprim::warp_exchange; /// /// int items[items_per_thread]; /// ... /// warp_exchange_int w_exchange; /// w_exchange.blocked_to_striped_shuffle(items, items); /// ... /// } /// \endcode template ROCPRIM_DEVICE ROCPRIM_INLINE void blocked_to_striped_shuffle(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread]) { static_assert(WarpSize % ItemsPerThread == 0, "ItemsPerThread must be a divisor of WarpSize to use blocked_to_striped_shuffle"); const unsigned int flat_id = ::rocprim::detail::logical_lane_id(); U work_array[ItemsPerThread]; ROCPRIM_UNROLL for(unsigned int dst_idx = 0; dst_idx < ItemsPerThread; dst_idx++) { ROCPRIM_UNROLL for(unsigned int src_idx = 0; src_idx < ItemsPerThread; src_idx++) { const auto value = ::rocprim::warp_shuffle( input[src_idx], flat_id / ItemsPerThread + dst_idx * (WarpSize / ItemsPerThread) ); if(src_idx == flat_id % ItemsPerThread) { work_array[dst_idx] = value; } } } ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { output[i] = work_array[i]; } } /// \brief Transposes a striped arrangement of items to a blocked arrangement /// across the warp, using temporary storage. /// /// \tparam U - [inferred] the output type. /// /// \param [in] input - array that data is loaded from. /// \param [out] output - array that data is loaded to. /// \param [in] storage - reference to a temporary storage object of type storage_type. /// /// \par Storage reusage /// Synchronization barrier should be placed before \p storage is reused /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads(). /// /// \par Example. /// \code{.cpp} /// __global__ void example_kernel(...) /// { /// constexpr unsigned int threads_per_block = 128; /// constexpr unsigned int threads_per_warp = 8; /// constexpr unsigned int items_per_thread = 4; /// constexpr unsigned int warps_per_block = threads_per_block / threads_per_warp; /// const unsigned int warp_id = hipThreadIdx_x / threads_per_warp; /// // specialize warp_exchange for int, warp of 8 threads and 4 items per thread /// using warp_exchange_int = rocprim::warp_exchange; /// // allocate storage in shared memory /// __shared__ warp_exchange_int::storage_type storage[warps_per_block]; /// /// int items[items_per_thread]; /// ... /// warp_exchange_int w_exchange; /// w_exchange.striped_to_blocked(items, items, storage[warp_id]); /// ... /// } /// \endcode template ROCPRIM_DEVICE ROCPRIM_INLINE void striped_to_blocked(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], storage_type& storage) { const unsigned int flat_id = ::rocprim::detail::logical_lane_id(); storage_type_& storage_ = storage.get(); ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { storage_.buffer[i * WarpSize + flat_id] = input[i]; } ::rocprim::wave_barrier(); ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { output[i] = storage_.buffer[flat_id * ItemsPerThread + i]; } } /// \brief Transposes a striped arrangement of items to a blocked arrangement /// across the warp, using warp shuffle operations. /// Caution: this API is experimental. Performance might not be consistent. /// ItemsPerThread must be a divisor of WarpSize. /// /// \tparam U - [inferred] the output type. /// /// \param [in] input - array that data is loaded from. /// \param [out] output - array that data is loaded to. /// /// \par Example. /// \code{.cpp} /// __global__ void example_kernel(...) /// { /// constexpr unsigned int threads_per_block = 128; /// constexpr unsigned int threads_per_warp = 8; /// constexpr unsigned int items_per_thread = 4; /// constexpr unsigned int warps_per_block = threads_per_block / threads_per_warp; /// const unsigned int warp_id = hipThreadIdx_x / threads_per_warp; /// // specialize warp_exchange for int, warp of 8 threads and 4 items per thread /// using warp_exchange_int = rocprim::warp_exchange; /// /// int items[items_per_thread]; /// ... /// warp_exchange_int w_exchange; /// w_exchange.striped_to_blocked_shuffle(items, items); /// ... /// } /// \endcode template ROCPRIM_DEVICE ROCPRIM_INLINE void striped_to_blocked_shuffle(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread]) { static_assert(WarpSize % ItemsPerThread == 0, "ItemsPerThread must be a divisor of WarpSize to use striped_to_blocked_shuffle"); const unsigned int flat_id = ::rocprim::detail::logical_lane_id(); U work_array[ItemsPerThread]; ROCPRIM_UNROLL for(unsigned int dst_idx = 0; dst_idx < ItemsPerThread; dst_idx++) { ROCPRIM_UNROLL for(unsigned int src_idx = 0; src_idx < ItemsPerThread; src_idx++) { const auto value = ::rocprim::warp_shuffle( input[src_idx], (ItemsPerThread * flat_id + dst_idx) % WarpSize ); if(flat_id / (WarpSize / ItemsPerThread) == src_idx) { work_array[dst_idx] = value; } } } ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { output[i] = work_array[i]; } } /// \brief Orders \p input values according to ranks using temporary storage, /// then writes the values to \p output in a striped manner. /// No values in \p ranks should exists that exceed \p WarpSize*ItemsPerThread-1 . /// \tparam U - [inferred] the output type. /// /// \param [in] input - array that data is loaded from. /// \param [out] output - array that data is loaded to. /// \param [in] ranks - array containing the positions. /// \param [in] storage - reference to a temporary storage object of type storage_type. /// /// \par Storage reusage /// Synchronization barrier should be placed before \p storage is reused /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads(). /// /// \par Example. /// \code{.cpp} /// __global__ void example_kernel(...) /// { /// constexpr unsigned int threads_per_block = 128; /// constexpr unsigned int threads_per_warp = 8; /// constexpr unsigned int items_per_thread = 4; /// constexpr unsigned int warps_per_block = threads_per_block / threads_per_warp; /// const unsigned int warp_id = hipThreadIdx_x / threads_per_warp; /// // specialize warp_exchange for int, warp of 8 threads and 4 items per thread /// using warp_exchange_int = rocprim::warp_exchange; /// // allocate storage in shared memory /// __shared__ warp_exchange_int::storage_type storage[warps_per_block]; /// /// int items[items_per_thread]; /// /// // data-type of `ranks` should be able to contain warp_size*items_per_thread unique elements /// // unsigned short is sufficient for up to 1024*64 elements /// unsigned short ranks[items_per_thread]; /// ... /// warp_exchange_int w_exchange; /// w_exchange.scatter_to_striped(items, items, ranks, storage[warp_id]); /// ... /// } /// \endcode template ROCPRIM_DEVICE ROCPRIM_INLINE void scatter_to_striped( const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], const OffsetT (&ranks)[ItemsPerThread], storage_type& storage) { const unsigned int flat_id = ::rocprim::detail::logical_lane_id(); storage_type_& storage_ = storage.get(); ROCPRIM_UNROLL for (unsigned int i = 0; i < ItemsPerThread; i++) { storage_.buffer[ranks[i]] = input[i]; } ::rocprim::wave_barrier(); ROCPRIM_UNROLL for (unsigned int i = 0; i < ItemsPerThread; i++) { unsigned int item_offset = (i * WarpSize) + flat_id; output[i] = storage_.buffer[item_offset]; } } }; END_ROCPRIM_NAMESPACE /// @} // end of group warpmodule #endif // ROCPRIM_WARP_WARP_EXCHANGE_HPP_