// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef ROCPRIM_DEVICE_DEVICE_MERGE_HPP_ #define ROCPRIM_DEVICE_DEVICE_MERGE_HPP_ #include #include #include "../config.hpp" #include "../detail/various.hpp" #include "device_merge_config.hpp" #include "detail/device_merge.hpp" BEGIN_ROCPRIM_NAMESPACE /// \addtogroup devicemodule /// @{ namespace detail { template< class IndexIterator, class KeysInputIterator1, class KeysInputIterator2, class BinaryFunction > ROCPRIM_KERNEL __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void partition_kernel(IndexIterator index, KeysInputIterator1 keys_input1, KeysInputIterator2 keys_input2, const size_t input1_size, const size_t input2_size, const unsigned int spacing, BinaryFunction compare_function) { partition_kernel_impl( index, keys_input1, keys_input2, input1_size, input2_size, spacing, compare_function ); } template< unsigned int BlockSize, unsigned int ItemsPerThread, class IndexIterator, class KeysInputIterator1, class KeysInputIterator2, class KeysOutputIterator, class ValuesInputIterator1, class ValuesInputIterator2, class ValuesOutputIterator, class BinaryFunction > ROCPRIM_KERNEL __launch_bounds__(BlockSize) void merge_kernel(IndexIterator index, KeysInputIterator1 keys_input1, KeysInputIterator2 keys_input2, KeysOutputIterator keys_output, ValuesInputIterator1 values_input1, ValuesInputIterator2 values_input2, ValuesOutputIterator values_output, const size_t input1_size, const size_t input2_size, BinaryFunction compare_function) { merge_kernel_impl( index, keys_input1, keys_input2, keys_output, values_input1, values_input2, values_output, input1_size, input2_size, compare_function ); } #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ { \ auto _error = cudaGetLastError(); \ if(_error != cudaSuccess) return _error; \ if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ auto __error = cudaStreamSynchronize(stream); \ if(__error != cudaSuccess) return __error; \ auto _end = std::chrono::high_resolution_clock::now(); \ auto _d = std::chrono::duration_cast>(_end - start); \ std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } \ } template< class Config, class KeysInputIterator1, class KeysInputIterator2, class KeysOutputIterator, class ValuesInputIterator1, class ValuesInputIterator2, class ValuesOutputIterator, class BinaryFunction > inline cudaError_t merge_impl(void * temporary_storage, size_t& storage_size, KeysInputIterator1 keys_input1, KeysInputIterator2 keys_input2, KeysOutputIterator keys_output, ValuesInputIterator1 values_input1, ValuesInputIterator2 values_input2, ValuesOutputIterator values_output, const size_t input1_size, const size_t input2_size, BinaryFunction compare_function, const cudaStream_t stream, bool debug_synchronous) { using key_type = typename std::iterator_traits::value_type; using value_type = typename std::iterator_traits::value_type; // Get default config if Config is default_config using config = detail::default_or_custom_config< Config, detail::default_merge_config >; static constexpr unsigned int block_size = config::block_size; static constexpr unsigned int half_block = block_size / 2; static constexpr unsigned int items_per_thread = config::items_per_thread; static constexpr auto items_per_block = block_size * items_per_thread; const unsigned int partitions = ((input1_size + input2_size) + items_per_block - 1) / items_per_block; const size_t partition_bytes = (partitions + 1) * sizeof(unsigned int); if(temporary_storage == nullptr) { // storage_size is never zero storage_size = partition_bytes; return cudaSuccess; } if( partitions == 0u ) return cudaSuccess; // Start point for time measurements std::chrono::high_resolution_clock::time_point start; auto number_of_blocks = partitions; if(debug_synchronous) { std::cout << "block_size " << block_size << '\n'; std::cout << "number of blocks " << number_of_blocks << '\n'; std::cout << "items_per_block " << items_per_block << '\n'; } unsigned int * index = reinterpret_cast(temporary_storage); const unsigned partition_blocks = ((partitions + 1) + half_block - 1) / half_block; if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); detail::partition_kernel <<>>( index, keys_input1, keys_input2, input1_size, input2_size, items_per_block, compare_function ); ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("partition_kernel", input1_size, start); if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); detail::merge_kernel <<>>( index, keys_input1, keys_input2, keys_output, values_input1, values_input2, values_output, input1_size, input2_size, compare_function ); ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("merge_kernel", input1_size, start); return cudaSuccess; } #undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR } // end of detail namespace /// \brief Parallel merge primitive for device level. /// /// \p merge function performs a device-wide merge. /// Function merges two ordered sets of input values based on comparison function. /// /// \par Overview /// * The contents of the inputs are not altered by the merging function. /// * Returns the required size of \p temporary_storage in \p storage_size /// if \p temporary_storage in a null pointer. /// * Accepts custom compare_functions for merging across the device. /// /// \tparam Config - [optional] configuration of the primitive. It can be \p merge_config or /// a custom class with the same members. /// \tparam InputIterator1 - random-access iterator type of the first input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam InputIterator2 - random-access iterator type of the second input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam OutputIterator - random-access iterator type of the output range. Must meet the /// requirements of a C++ OutputIterator concept. It can be a simple pointer type. /// /// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When /// a null pointer is passed, the required allocation size (in bytes) is written to /// \p storage_size and function returns without performing the sort operation. /// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage. /// \param [in] input1 - iterator to the first element in the first range to merge. /// \param [in] input2 - iterator to the first element in the second range to merge. /// \param [out] output - iterator to the first element in the output range. /// \param [in] input1_size - number of element in the first input range. /// \param [in] input2_size - number of element in the second input range. /// \param [in] compare_function - binary operation function object that will be used for comparison. /// The signature of the function should be equivalent to the following: /// bool f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. /// The default value is \p BinaryFunction(). /// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream). /// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel /// launch is forced in order to check for errors. Default value is \p false. /// /// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of /// type \p cudaError_t. /// /// \par Example /// \parblock /// In this example a device-level ascending merge is performed on an array of /// \p int values. /// /// \code{.cpp} /// #include /// /// // Prepare input and output (declare pointers, allocate device memory etc.) /// size_t input_size1; // e.g., 4 /// size_t input_size2; // e.g., 4 /// int * input1; // e.g., [0, 1, 2, 3] /// int * input2; // e.g., [0, 1, 2, 3] /// int * output; // empty array of 8 elements /// /// size_t temporary_storage_size_bytes; /// void * temporary_storage_ptr = nullptr; /// // Get required size of the temporary storage /// rocprim::merge( /// temporary_storage_ptr, temporary_storage_size_bytes, /// input1, input2, output, input_size1, input_size2 /// ); /// /// // allocate temporary storage /// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes); /// /// // perform merge /// rocprim::merge( /// temporary_storage_ptr, temporary_storage_size_bytes, /// input1, input2, output, input_size1, input_size2 /// ); /// // output: [0, 0, 1, 1, 2, 2, 3, 3] /// \endcode /// \endparblock template< class Config = default_config, class InputIterator1, class InputIterator2, class OutputIterator, class BinaryFunction = ::rocprim::less::value_type> > inline cudaError_t merge(void * temporary_storage, size_t& storage_size, InputIterator1 input1, InputIterator2 input2, OutputIterator output, const size_t input1_size, const size_t input2_size, BinaryFunction compare_function = BinaryFunction(), const cudaStream_t stream = 0, bool debug_synchronous = false) { empty_type * values = nullptr; return detail::merge_impl( temporary_storage, storage_size, input1, input2, output, values, values, values, input1_size, input2_size, compare_function, stream, debug_synchronous ); } /// \brief Parallel merge primitive for device level. /// /// \p merge function performs a device-wide merge of (key, value) pairs. /// Function merges two ordered sets of input keys and corresponding values /// based on key comparison function. /// /// \par Overview /// * The contents of the inputs are not altered by the merging function. /// * Returns the required size of \p temporary_storage in \p storage_size /// if \p temporary_storage in a null pointer. /// * Accepts custom compare_functions for merging across the device. /// /// \tparam Config - [optional] configuration of the primitive. It can be \p merge_config or /// a custom class with the same members. /// \tparam KeysInputIterator1 - random-access iterator type of the first keys input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam KeysInputIterator2 - random-access iterator type of the second keys input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam KeysOutputIterator - random-access iterator type of the keys output range. Must meet the /// requirements of a C++ OutputIterator concept. It can be a simple pointer type. /// \tparam ValuesInputIterator1 - random-access iterator type of the first values input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam ValuesInputIterator2 - random-access iterator type of the second values input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam ValuesOutputIterator - random-access iterator type of the values output range. Must meet the /// requirements of a C++ OutputIterator concept. It can be a simple pointer type. /// /// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When /// a null pointer is passed, the required allocation size (in bytes) is written to /// \p storage_size and function returns without performing the sort operation. /// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage. /// \param [in] keys_input1 - iterator to the first key in the first range to merge. /// \param [in] keys_input2 - iterator to the first key in the second range to merge. /// \param [out] keys_output - iterator to the first key in the output range. /// \param [in] values_input1 - iterator to the first value in the first range to merge. /// \param [in] values_input2 - iterator to the first value in the second range to merge. /// \param [out] values_output - iterator to the first value in the output range. /// \param [in] input1_size - number of element in the first input range. /// \param [in] input2_size - number of element in the second input range. /// \param [in] compare_function - binary operation function object that will be used for key comparison. /// The signature of the function should be equivalent to the following: /// bool f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. /// The default value is \p BinaryFunction(). /// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream). /// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel /// launch is forced in order to check for errors. Default value is \p false. /// /// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of /// type \p cudaError_t. /// /// \par Example /// \parblock /// In this example a device-level ascending merge is performed on an array of /// \p int values. /// /// \code{.cpp} /// #include /// /// // Prepare input and output (declare pointers, allocate device memory etc.) /// size_t input_size1; // e.g., 4 /// size_t input_size2; // e.g., 4 /// int * keys_input1; // e.g., [0, 1, 2, 3] /// int * keys_input2; // e.g., [0, 1, 2, 3] /// int * keys_output; // empty array of 8 elements /// int * values_input1; // e.g., [10, 11, 12, 13] /// int * values_input2; // e.g., [20, 21, 22, 23] /// int * values_output; // empty array of 8 elements /// /// size_t temporary_storage_size_bytes; /// void * temporary_storage_ptr = nullptr; /// // Get required size of the temporary storage /// rocprim::merge( /// temporary_storage_ptr, temporary_storage_size_bytes, /// keys_input1, keys_input2, keys_output, /// values_input1, values_input2, values_output, // input_size1, input_size2 /// ); /// /// // allocate temporary storage /// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes); /// /// // perform merge /// rocprim::merge( /// temporary_storage_ptr, temporary_storage_size_bytes, /// keys_input1, keys_input2, keys_output, /// values_input1, values_input2, values_output, // input_size1, input_size2 /// ); /// // keys_output: [0, 0, 1, 1, 2, 2, 3, 3] /// // values_output: [10, 20, 11, 21, 12, 22, 13, 23] /// \endcode /// \endparblock template< class Config = default_config, class KeysInputIterator1, class KeysInputIterator2, class KeysOutputIterator, class ValuesInputIterator1, class ValuesInputIterator2, class ValuesOutputIterator, class BinaryFunction = ::rocprim::less::value_type> > inline cudaError_t merge(void * temporary_storage, size_t& storage_size, KeysInputIterator1 keys_input1, KeysInputIterator2 keys_input2, KeysOutputIterator keys_output, ValuesInputIterator1 values_input1, ValuesInputIterator2 values_input2, ValuesOutputIterator values_output, const size_t input1_size, const size_t input2_size, BinaryFunction compare_function = BinaryFunction(), const cudaStream_t stream = 0, bool debug_synchronous = false) { return detail::merge_impl( temporary_storage, storage_size, keys_input1, keys_input2, keys_output, values_input1, values_input2, values_output, input1_size, input2_size, compare_function, stream, debug_synchronous ); } /// @} // end of group devicemodule END_ROCPRIM_NAMESPACE #endif // ROCPRIM_DEVICE_DEVICE_MERGE_HPP_