Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
GLM-130B_fastertransformer
Commits
f8a481f8
Commit
f8a481f8
authored
Oct 13, 2023
by
zhouxiang
Browse files
添加dtk中的cub头文件
parent
7b7c64c5
Changes
147
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
9961 additions
and
0 deletions
+9961
-0
3rdparty/cub/iterator/constant_input_iterator.cuh
3rdparty/cub/iterator/constant_input_iterator.cuh
+60
-0
3rdparty/cub/iterator/counting_input_iterator.cuh
3rdparty/cub/iterator/counting_input_iterator.cuh
+60
-0
3rdparty/cub/iterator/discard_output_iterator.cuh
3rdparty/cub/iterator/discard_output_iterator.cuh
+231
-0
3rdparty/cub/iterator/tex_obj_input_iterator.cuh
3rdparty/cub/iterator/tex_obj_input_iterator.cuh
+88
-0
3rdparty/cub/iterator/tex_ref_input_iterator.cuh
3rdparty/cub/iterator/tex_ref_input_iterator.cuh
+87
-0
3rdparty/cub/iterator/transform_input_iterator.cuh
3rdparty/cub/iterator/transform_input_iterator.cuh
+63
-0
3rdparty/cub/rocprim/block/block_adjacent_difference.hpp
3rdparty/cub/rocprim/block/block_adjacent_difference.hpp
+1155
-0
3rdparty/cub/rocprim/block/block_discontinuity.hpp
3rdparty/cub/rocprim/block/block_discontinuity.hpp
+803
-0
3rdparty/cub/rocprim/block/block_exchange.hpp
3rdparty/cub/rocprim/block/block_exchange.hpp
+769
-0
3rdparty/cub/rocprim/block/block_histogram.hpp
3rdparty/cub/rocprim/block/block_histogram.hpp
+328
-0
3rdparty/cub/rocprim/block/block_load.hpp
3rdparty/cub/rocprim/block/block_load.hpp
+891
-0
3rdparty/cub/rocprim/block/block_load_func.hpp
3rdparty/cub/rocprim/block/block_load_func.hpp
+511
-0
3rdparty/cub/rocprim/block/block_radix_sort.hpp
3rdparty/cub/rocprim/block/block_radix_sort.hpp
+1016
-0
3rdparty/cub/rocprim/block/block_reduce.hpp
3rdparty/cub/rocprim/block/block_reduce.hpp
+414
-0
3rdparty/cub/rocprim/block/block_scan.hpp
3rdparty/cub/rocprim/block/block_scan.hpp
+1322
-0
3rdparty/cub/rocprim/block/block_shuffle.hpp
3rdparty/cub/rocprim/block/block_shuffle.hpp
+490
-0
3rdparty/cub/rocprim/block/block_sort.hpp
3rdparty/cub/rocprim/block/block_sort.hpp
+373
-0
3rdparty/cub/rocprim/block/block_store.hpp
3rdparty/cub/rocprim/block/block_store.hpp
+560
-0
3rdparty/cub/rocprim/block/block_store_func.hpp
3rdparty/cub/rocprim/block/block_store_func.hpp
+393
-0
3rdparty/cub/rocprim/block/detail/block_adjacent_difference_impl.hpp
...b/rocprim/block/detail/block_adjacent_difference_impl.hpp
+347
-0
No files found.
Too many changes to show.
To preserve performance only
147 of 147+
files are displayed.
Plain diff
Email patch
3rdparty/cub/iterator/constant_input_iterator.cuh
0 → 100644
View file @
f8a481f8
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_CONSTANT_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_CONSTANT_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#include <cub/rocprim/iterator/constant_iterator.hpp>
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/// hipCUB-style name for rocPRIM's constant iterator.
///
/// \tparam ValueType  type of the value forwarded to ::rocprim::constant_iterator.
/// \tparam OffsetT    offset/difference type forwarded to ::rocprim::constant_iterator
///                    (defaults to std::ptrdiff_t).
template<typename ValueType, typename OffsetT = std::ptrdiff_t>
using ConstantInputIterator = ::rocprim::constant_iterator<ValueType, OffsetT>;
#endif
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_CONSTANT_INPUT_ITERATOR_HPP_
3rdparty/cub/iterator/counting_input_iterator.cuh
0 → 100644
View file @
f8a481f8
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_COUNTING_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_COUNTING_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#include <cub/rocprim/iterator/counting_iterator.hpp>
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/// hipCUB-style name for rocPRIM's counting iterator.
///
/// \tparam ValueType  type of the value forwarded to ::rocprim::counting_iterator.
/// \tparam OffsetT    offset/difference type forwarded to ::rocprim::counting_iterator
///                    (defaults to std::ptrdiff_t).
template<typename ValueType, typename OffsetT = std::ptrdiff_t>
using CountingInputIterator = ::rocprim::counting_iterator<ValueType, OffsetT>;
#endif
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_COUNTING_INPUT_ITERATOR_HPP_
3rdparty/cub/iterator/discard_output_iterator.cuh
0 → 100644
View file @
f8a481f8
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_DISCARD_OUTPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_DISCARD_OUTPUT_ITERATOR_HPP_
#include <cstddef>
#include <iterator>
#include <iostream>
#include "../config.hpp"
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
/**
* \addtogroup UtilIterator
* @{
*/
/**
 * \brief A random-access output iterator that discards everything written through it.
 *
 * Dereferencing or subscripting the iterator yields the iterator itself, and assigning
 * any value to it is a no-op, so the only state that is tracked is the current offset.
 *
 * \tparam OffsetT  integer type used for the tracked offset and as \p difference_type.
 */
template<typename OffsetT = std::ptrdiff_t>
class DiscardOutputIterator
{
public:
    // Required iterator traits
    typedef DiscardOutputIterator self_type;       ///< My own type
    typedef OffsetT               difference_type; ///< Type to express the result of subtracting one iterator from another
    typedef void                  value_type;      ///< The type of the element the iterator can point to
    typedef void                  pointer;         ///< The type of a pointer to an element the iterator can point to
    typedef void                  reference;       ///< The type of a reference to an element the iterator can point to

#if (THRUST_VERSION >= 100700)
    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
    typedef typename thrust::detail::iterator_facade_category<
        thrust::any_system_tag,
        thrust::random_access_traversal_tag,
        value_type,
        reference>::type iterator_category; ///< The iterator category
#else
    typedef std::random_access_iterator_tag iterator_category; ///< The iterator category
#endif // THRUST_VERSION

private:
    OffsetT offset; // Current position; the only state this iterator carries.

public:
    /// Constructor
    /// \param offset  base offset (defaults to 0).
    __host__ __device__ __forceinline__ DiscardOutputIterator(OffsetT offset = 0)
        : offset(offset)
    {}

    /// \brief Postfix increment: advances by one and returns the previous position.
    __host__ __device__ __forceinline__ self_type operator++(int)
    {
        self_type retval = *this;
        offset++;
        return retval;
    }

    /// \brief Prefix increment: advances by one and returns a copy of the advanced iterator.
    /// The by-value return type is kept as-is so existing callers are unaffected.
    __host__ __device__ __forceinline__ self_type operator++()
    {
        offset++;
        return *this;
    }

    /// \brief Indirection
    __host__ __device__ __forceinline__ self_type& operator*()
    {
        // return self reference, which can be assigned to anything
        return *this;
    }

    /// \brief Addition: returns an iterator positioned \p n past this one.
    template<typename Distance>
    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
    {
        self_type retval(offset + n);
        return retval;
    }

    /// \brief Addition assignment
    template<typename Distance>
    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
    {
        offset += n;
        return *this;
    }

    /// \brief Subtraction: returns an iterator positioned \p n before this one.
    template<typename Distance>
    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
    {
        self_type retval(offset - n);
        return retval;
    }

    /// \brief Subtraction assignment
    template<typename Distance>
    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
    {
        offset -= n;
        return *this;
    }

    /// \brief Distance: difference between the offsets of two iterators.
    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
    {
        return offset - other.offset;
    }

    /// \brief Array subscript (the index is ignored)
    template<typename Distance>
    __host__ __device__ __forceinline__ self_type& operator[](Distance)
    {
        // return self reference, which can be assigned to anything
        return *this;
    }

    /// Structure dereference (\p pointer is \p void, so nothing is returned)
    __host__ __device__ __forceinline__ pointer operator->()
    {
        return;
    }

    /// Assignment to anything else (no-op)
    template<typename T>
    __host__ __device__ __forceinline__ void operator=(T const&)
    {}

    /// Cast to void* operator (always yields a null pointer)
    __host__ __device__ __forceinline__ operator void*() const
    {
        return nullptr;
    }

    /// \brief Equal to
    ///
    /// Const-qualified on purpose: without it, comparing const iterators silently
    /// falls back to the void* conversion above and compares two null pointers.
    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) const
    {
        return (offset == rhs.offset);
    }

    /// \brief Not equal to
    ///
    /// Const-qualified for the same reason as operator==.
    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) const
    {
        return (offset != rhs.offset);
    }

    /// \brief ostream operator: prints the offset as "[offset]".
    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
    {
        os << "[" << itr.offset << "]";
        return os;
    }
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_DISCARD_OUTPUT_ITERATOR_HPP_
3rdparty/cub/iterator/tex_obj_input_iterator.cuh
0 → 100644
View file @
f8a481f8
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_TEX_OBJ_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_TEX_OBJ_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
#include <cub/rocprim/iterator/texture_cache_iterator.hpp>
BEGIN_HIPCUB_NAMESPACE
/// CUB-style texture-object input iterator implemented on top of
/// ::rocprim::texture_cache_iterator. It adds CUB-named bind/unbind wrappers
/// that forward to the base class.
///
/// \tparam T        value type forwarded to the rocPRIM base iterator.
/// \tparam OffsetT  offset type forwarded to the rocPRIM base iterator.
template<typename T, typename OffsetT = std::ptrdiff_t>
class TexObjInputIterator : public ::rocprim::texture_cache_iterator<T, OffsetT>
{
public:
    /// Forwards to the base class' bind_texture() and converts the status to cudaError_t.
    ///
    /// \param ptr             pointer handed to bind_texture().
    /// \param bytes           size handed to bind_texture() (defaults to size_t(-1)).
    /// \param texture_offset  offset handed to bind_texture() (defaults to 0).
    template<class Qualified>
    inline cudaError_t BindTexture(Qualified* ptr,
                                   size_t     bytes          = size_t(-1),
                                   size_t     texture_offset = 0)
    {
        using texture_base = ::rocprim::texture_cache_iterator<T, OffsetT>;
        const auto status  = texture_base::bind_texture(ptr, bytes, texture_offset);
        return (cudaError_t)status;
    }

    /// Forwards to the base class' unbind_texture() and converts the status to cudaError_t.
    inline cudaError_t UnbindTexture()
    {
        using texture_base = ::rocprim::texture_cache_iterator<T, OffsetT>;
        const auto status  = texture_base::unbind_texture();
        return (cudaError_t)status;
    }

    HIPCUB_HOST_DEVICE inline ~TexObjInputIterator() = default;

    /// Default-constructs the rocPRIM base iterator.
    HIPCUB_HOST_DEVICE inline TexObjInputIterator()
        : ::rocprim::texture_cache_iterator<T, OffsetT>()
    {
    }

    /// Builds a TexObjInputIterator from an existing rocPRIM texture cache iterator.
    HIPCUB_HOST_DEVICE inline TexObjInputIterator(
        const ::rocprim::texture_cache_iterator<T, OffsetT> other)
        : ::rocprim::texture_cache_iterator<T, OffsetT>(other)
    {
    }
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_TEX_OBJ_INPUT_ITERATOR_HPP_
3rdparty/cub/iterator/tex_ref_input_iterator.cuh
0 → 100644
View file @
f8a481f8
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_TEX_REF_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_TEX_REF_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
#include <rocprim/iterator/texture_cache_iterator.hpp>
BEGIN_HIPCUB_NAMESPACE
/// CUB-style texture-reference input iterator implemented on top of
/// ::rocprim::texture_cache_iterator. It adds CUB-named bind/unbind wrappers
/// that forward to the base class.
///
/// \tparam T          value type forwarded to the rocPRIM base iterator.
/// \tparam UNIQUE_ID  unused parameter for compatibility with original definition in cub.
/// \tparam OffsetT    offset type forwarded to the rocPRIM base iterator.
template<typename T, int UNIQUE_ID, typename OffsetT = std::ptrdiff_t>
class TexRefInputIterator : public ::rocprim::texture_cache_iterator<T, OffsetT>
{
public:
    /// Forwards to the base class' bind_texture() and converts the status to cudaError_t.
    ///
    /// \param ptr             pointer handed to bind_texture().
    /// \param bytes           size handed to bind_texture() (defaults to size_t(-1)).
    /// \param texture_offset  offset handed to bind_texture() (defaults to 0).
    template<class Qualified>
    inline cudaError_t BindTexture(Qualified* ptr,
                                   size_t     bytes          = size_t(-1),
                                   size_t     texture_offset = 0)
    {
        using texture_base = ::rocprim::texture_cache_iterator<T, OffsetT>;
        const auto status  = texture_base::bind_texture(ptr, bytes, texture_offset);
        return (cudaError_t)status;
    }

    /// Forwards to the base class' unbind_texture() and converts the status to cudaError_t.
    inline cudaError_t UnbindTexture()
    {
        using texture_base = ::rocprim::texture_cache_iterator<T, OffsetT>;
        const auto status  = texture_base::unbind_texture();
        return (cudaError_t)status;
    }

    HIPCUB_HOST_DEVICE inline ~TexRefInputIterator() = default;

    /// Default-constructs the rocPRIM base iterator.
    HIPCUB_HOST_DEVICE inline TexRefInputIterator()
        : ::rocprim::texture_cache_iterator<T, OffsetT>()
    {
    }

    /// Builds a TexRefInputIterator from an existing rocPRIM texture cache iterator.
    HIPCUB_HOST_DEVICE inline TexRefInputIterator(
        const ::rocprim::texture_cache_iterator<T, OffsetT> other)
        : ::rocprim::texture_cache_iterator<T, OffsetT>(other)
    {
    }
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_TEX_REF_INPUT_ITERATOR_HPP_
3rdparty/cub/iterator/transform_input_iterator.cuh
0 → 100644
View file @
f8a481f8
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_TRANSFORM_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_TRANSFORM_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#include <cub/rocprim/iterator/transform_iterator.hpp>
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/// hipCUB-style name for rocPRIM's transform iterator.
///
/// Note that the template parameters are passed to ::rocprim::transform_iterator
/// in a different order than they are declared here.
///
/// \tparam ValueType       value type forwarded as the third rocPRIM argument.
/// \tparam ConversionOp    functor type forwarded as the second rocPRIM argument.
/// \tparam InputIteratorT  wrapped iterator type forwarded as the first rocPRIM argument.
/// \tparam OffsetT         ignored; accepted only so the CUB parameter list is preserved.
template<typename ValueType,
         typename ConversionOp,
         typename InputIteratorT,
         typename OffsetT = std::ptrdiff_t // ignored
         >
using TransformInputIterator
    = ::rocprim::transform_iterator<InputIteratorT, ConversionOp, ValueType>;
#endif
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_TRANSFORM_INPUT_ITERATOR_HPP_
3rdparty/cub/rocprim/block/block_adjacent_difference.hpp
0 → 100644
View file @
f8a481f8
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef ROCPRIM_BLOCK_BLOCK_ADJACENT_DIFFERENCE_HPP_
#define ROCPRIM_BLOCK_BLOCK_ADJACENT_DIFFERENCE_HPP_
#include "detail/block_adjacent_difference_impl.hpp"
#include "../config.hpp"
#include "../detail/various.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief The \p block_adjacent_difference class is a block level parallel primitive which provides
/// methods for applying binary functions for pairs of consecutive items partitioned across a thread
/// block.
///
/// \tparam T - the input type.
/// \tparam BlockSize - the number of threads in a block.
///
/// \par Overview
/// * There are two types of flags:
/// * Head flags.
/// * Tail flags.
/// * The above flags are used to differentiate items from their predecessors or successors.
/// * E.g. Head flags are convenient for differentiating disjoint data segments as part of a
/// segmented reduction/scan.
///
/// \par Examples
/// \parblock
/// In the example, a discontinuity operation is performed on a block of 128 threads, using type
/// \p int.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int head_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads(head_flags, input, flag_op_type(), storage);
/// ...
/// }
/// \endcode
/// \endparblock
template
<
class
T
,
unsigned
int
BlockSizeX
,
unsigned
int
BlockSizeY
=
1
,
unsigned
int
BlockSizeZ
=
1
>
class
block_adjacent_difference
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hide implementation detail from documentation
:
private
detail
::
block_adjacent_difference_impl
<
T
,
BlockSizeX
,
BlockSizeY
,
BlockSizeZ
>
#endif // DOXYGEN_SHOULD_SKIP_THIS
{
using
base_type
=
detail
::
block_adjacent_difference_impl
<
T
,
BlockSizeX
,
BlockSizeY
,
BlockSizeZ
>
;
static
constexpr
unsigned
BlockSize
=
base_type
::
BlockSize
;
// Struct used for creating a raw_storage object for this primitive's temporary storage.
struct
storage_type_
{
typename
base_type
::
storage_type
left
;
typename
base_type
::
storage_type
right
;
};
public:
/// \brief Struct used to allocate a temporary memory that is required for thread
/// communication during operations provided by related parallel primitive.
///
/// Depending on the implementation the operations exposed by parallel primitive may
/// require a temporary storage for thread communication. The storage should be allocated
/// using keywords <tt>__shared__</tt>. It can be aliased to
/// an externally allocated memory, or be a part of a union type with other storage types
/// to increase shared memory reusability.
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
using
storage_type
=
detail
::
raw_storage
<
storage_type_
>
;
#else
using
storage_type
=
storage_type_
;
#endif
/// \brief Tags \p head_flags that indicate discontinuities between items partitioned
/// across the thread block, where the first item has no reference and is always
/// flagged.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_left() or block_discontinuity::flag_heads() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
///     // specialize discontinuity for int and a block of 128 threads
///     using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
///     // allocate storage in shared memory
///     __shared__ block_adjacent_difference_int::storage_type storage;
///
///     // segment of consecutive items to be used
///     int input[8];
///     ...
///     int head_flags[8];
///     block_adjacent_difference_int b_discontinuity;
///     using flag_op_type = typename rocprim::greater<int>;
///     b_discontinuity.flag_heads(head_flags, input, flag_op_type(), storage);
///     ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use subtract_left or block_discontinuity.flag_heads instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
                const T (&input)[ItemsPerThread],
                FlagOp flag_op,
                storage_type& storage)
{
    // Compile-time switches selecting the apply_left variant used for head flags.
    static constexpr auto as_flags         = true;
    static constexpr auto reversed         = true;
    static constexpr auto with_predecessor = false;

    // Only the "left" half of the temporary storage is used here.
    // NOTE(review): with_predecessor is false, so input[0] presumably only fills the
    // predecessor argument slot - confirm against apply_left.
    base_type::template apply_left<as_flags, reversed, with_predecessor>(
        input,
        head_flags,
        flag_op,
        input[0] /* predecessor */,
        storage.get().left);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_left() or block_discontinuity::flag_heads() instead.
/// This overload does not take a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template <unsigned int ItemsPerThread, class Flag, class FlagOp>
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use subtract_left or block_discontinuity.flag_heads instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
                const T (&input)[ItemsPerThread],
                FlagOp flag_op)
{
    // Temporary storage is declared here with ROCPRIM_SHARED_MEMORY and handed
    // to the overload that takes the storage reference explicitly.
    ROCPRIM_SHARED_MEMORY storage_type storage;
    flag_heads(head_flags, input, flag_op, storage);
}
/// \brief Tags \p head_flags that indicate discontinuities between items partitioned
/// across the thread block, where the first item of the first thread is compared against
/// a \p tile_predecessor_item.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_left() or block_discontinuity::flag_heads() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] tile_predecessor_item - first tile item from thread to be compared
/// against.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int head_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads(head_flags, tile_item, input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template <unsigned int ItemsPerThread, class Flag, class FlagOp>
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use subtract_left or block_discontinuity.flag_heads instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
                T tile_predecessor_item,
                const T (&input)[ItemsPerThread],
                FlagOp flag_op,
                storage_type& storage)
{
    // Compile-time switches forwarded as template arguments to base_type::apply_left.
    static constexpr auto as_flags         = true;
    static constexpr auto reversed         = true;
    static constexpr auto with_predecessor = true;

    // The caller-provided tile predecessor is forwarded together with the
    // "left" member of the temporary storage.
    base_type::template apply_left<as_flags, reversed, with_predecessor>(
        input, head_flags, flag_op, tile_predecessor_item, storage.get().left);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_left() or block_discontinuity::flag_heads() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template <unsigned int ItemsPerThread, class Flag, class FlagOp>
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use subtract_left or block_discontinuity.flag_heads instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
                T tile_predecessor_item,
                const T (&input)[ItemsPerThread],
                FlagOp flag_op)
{
    // Temporary storage is declared here with ROCPRIM_SHARED_MEMORY and handed
    // to the overload that takes the storage reference explicitly.
    ROCPRIM_SHARED_MEMORY storage_type storage;
    flag_heads(head_flags, tile_predecessor_item, input, flag_op, storage);
}
/// \brief Tags \p tail_flags that indicate discontinuities between items partitioned
/// across the thread block, where the last item has no reference and is always
/// flagged.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_right() or block_discontinuity::flag_tails() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int tail_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_tails(tail_flags, input, flag_op_type(), storage);
/// ...
/// }
/// \endcode
template <unsigned int ItemsPerThread, class Flag, class FlagOp>
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use subtract_right or block_discontinuity.flag_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
                const T (&input)[ItemsPerThread],
                FlagOp flag_op,
                storage_type& storage)
{
    // Compile-time switches forwarded as template arguments to base_type::apply_right.
    static constexpr auto as_flags       = true;
    static constexpr auto reversed       = true;
    static constexpr auto with_successor = false;

    // The caller supplies no tile successor here; input[0] only fills the
    // successor argument (presumably unused because with_successor is false).
    base_type::template apply_right<as_flags, reversed, with_successor>(
        input, tail_flags, flag_op, input[0] /* successor */, storage.get().right);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_right() or block_discontinuity::flag_tails() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template <unsigned int ItemsPerThread, class Flag, class FlagOp>
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use subtract_right or block_discontinuity.flag_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
                const T (&input)[ItemsPerThread],
                FlagOp flag_op)
{
    // Temporary storage is declared here with ROCPRIM_SHARED_MEMORY and handed
    // to the overload that takes the storage reference explicitly.
    ROCPRIM_SHARED_MEMORY storage_type storage;
    flag_tails(tail_flags, input, flag_op, storage);
}
/// \brief Tags \p tail_flags that indicate discontinuities between items partitioned
/// across the thread block, where the last item of the last thread is compared against
/// a \p tile_successor_item.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_right() or block_discontinuity::flag_tails() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] tile_successor_item - last tile item from thread to be compared
/// against.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int tail_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_tails(tail_flags, tile_item, input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template <unsigned int ItemsPerThread, class Flag, class FlagOp>
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use subtract_right or block_discontinuity.flag_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
                T tile_successor_item,
                const T (&input)[ItemsPerThread],
                FlagOp flag_op,
                storage_type& storage)
{
    // Compile-time switches forwarded as template arguments to base_type::apply_right.
    static constexpr auto as_flags       = true;
    static constexpr auto reversed       = true;
    static constexpr auto with_successor = true;

    // The caller-provided tile successor is forwarded together with the
    // "right" member of the temporary storage.
    base_type::template apply_right<as_flags, reversed, with_successor>(
        input, tail_flags, flag_op, tile_successor_item, storage.get().right);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_right() or block_discontinuity::flag_tails() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template <unsigned int ItemsPerThread, class Flag, class FlagOp>
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use subtract_right or block_discontinuity.flag_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
                T tile_successor_item,
                const T (&input)[ItemsPerThread],
                FlagOp flag_op)
{
    // Temporary storage is declared here with ROCPRIM_SHARED_MEMORY and handed
    // to the overload that takes the storage reference explicitly.
    ROCPRIM_SHARED_MEMORY storage_type storage;
    flag_tails(tail_flags, tile_successor_item, input, flag_op, storage);
}
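/// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.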
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tail_flags, input,
/// flag_op_type(), storage);
/// ...
/// }
/// \endcode
template <unsigned int ItemsPerThread, class Flag, class FlagOp>
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          Flag (&tail_flags)[ItemsPerThread],
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op,
                          storage_type& storage)
{
    // Compile-time switches forwarded as template arguments to
    // base_type::apply_left / base_type::apply_right.
    static constexpr auto as_flags         = true;
    static constexpr auto reversed         = true;
    static constexpr auto with_predecessor = false;
    static constexpr auto with_successor   = false;

    // Copy items in case head_flags is aliased with input
    T items[ItemsPerThread];
    ROCPRIM_UNROLL
    for(unsigned int i = 0; i < ItemsPerThread; ++i)
    {
        items[i] = input[i];
    }

    // Head pass: uses the "left" member of the storage. items[0] only fills the
    // predecessor argument (presumably unused because with_predecessor is false).
    base_type::template apply_left<as_flags, reversed, with_predecessor>(
        items, head_flags, flag_op, items[0] /*predecessor*/, storage.get().left);

    // Tail pass: uses the separate "right" member of the storage. items[0] only
    // fills the successor argument (presumably unused because with_successor is false).
    base_type::template apply_right<as_flags, reversed, with_successor>(
        items, tail_flags, flag_op, items[0] /*successor*/, storage.get().right);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template <unsigned int ItemsPerThread, class Flag, class FlagOp>
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          Flag (&tail_flags)[ItemsPerThread],
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op)
{
    // Temporary storage is declared here with ROCPRIM_SHARED_MEMORY and handed
    // to the overload that takes the storage reference explicitly.
    ROCPRIM_SHARED_MEMORY storage_type storage;
    flag_heads_and_tails(head_flags, tail_flags, input, flag_op, storage);
}
/// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block, where the last item of the
/// last thread is compared against a \p tile_successor_item.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] tile_successor_item - last tile item from thread to be compared
/// against.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tail_flags, tile_item,
/// input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template <unsigned int ItemsPerThread, class Flag, class FlagOp>
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          Flag (&tail_flags)[ItemsPerThread],
                          T tile_successor_item,
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op,
                          storage_type& storage)
{
    // Compile-time switches forwarded as template arguments to
    // base_type::apply_left / base_type::apply_right.
    static constexpr auto as_flags         = true;
    static constexpr auto reversed         = true;
    static constexpr auto with_predecessor = false;
    static constexpr auto with_successor   = true;

    // Copy items in case head_flags is aliased with input
    T items[ItemsPerThread];
    ROCPRIM_UNROLL
    for(unsigned int i = 0; i < ItemsPerThread; ++i)
    {
        items[i] = input[i];
    }

    // Head pass: uses the "left" member of the storage. items[0] only fills the
    // predecessor argument (presumably unused because with_predecessor is false).
    base_type::template apply_left<as_flags, reversed, with_predecessor>(
        items, head_flags, flag_op, items[0] /*predecessor*/, storage.get().left);

    // Tail pass: uses the separate "right" member of the storage and the
    // caller-provided tile successor.
    base_type::template apply_right<as_flags, reversed, with_successor>(
        items, tail_flags, flag_op, tile_successor_item, storage.get().right);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template <unsigned int ItemsPerThread, class Flag, class FlagOp>
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          Flag (&tail_flags)[ItemsPerThread],
                          T tile_successor_item,
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op)
{
    // Temporary storage is declared here with ROCPRIM_SHARED_MEMORY and handed
    // to the overload that takes the storage reference explicitly.
    ROCPRIM_SHARED_MEMORY storage_type storage;
    flag_heads_and_tails(head_flags, tail_flags, tile_successor_item, input, flag_op, storage);
}
/// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block, where the first item of the
/// first thread is compared against a \p tile_predecessor_item.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] tile_predecessor_item - first tile item from thread to be compared
/// against.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tile_item, tail_flags,
/// input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template <unsigned int ItemsPerThread, class Flag, class FlagOp>
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          T tile_predecessor_item,
                          Flag (&tail_flags)[ItemsPerThread],
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op,
                          storage_type& storage)
{
    // Compile-time switches forwarded as template arguments to
    // base_type::apply_left / base_type::apply_right.
    static constexpr auto as_flags         = true;
    static constexpr auto reversed         = true;
    static constexpr auto with_predecessor = true;
    static constexpr auto with_successor   = false;

    // Copy items in case head_flags is aliased with input
    T items[ItemsPerThread];
    ROCPRIM_UNROLL
    for(unsigned int i = 0; i < ItemsPerThread; ++i)
    {
        items[i] = input[i];
    }

    // Head pass: uses the "left" member of the storage and the caller-provided
    // tile predecessor.
    base_type::template apply_left<as_flags, reversed, with_predecessor>(
        items, head_flags, flag_op, tile_predecessor_item, storage.get().left);

    // Tail pass: uses the separate "right" member of the storage. items[0] only
    // fills the successor argument (presumably unused because with_successor is false).
    base_type::template apply_right<as_flags, reversed, with_successor>(
        items, tail_flags, flag_op, items[0] /*successor*/, storage.get().right);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template <unsigned int ItemsPerThread, class Flag, class FlagOp>
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          T tile_predecessor_item,
                          Flag (&tail_flags)[ItemsPerThread],
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op)
{
    // Temporary storage is declared here with ROCPRIM_SHARED_MEMORY and handed
    // to the overload that takes the storage reference explicitly.
    ROCPRIM_SHARED_MEMORY storage_type storage;
    flag_heads_and_tails(head_flags, tile_predecessor_item, tail_flags, input, flag_op, storage);
}
/// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block, where the first item of the
/// first thread is compared against a \p tile_predecessor_item and the last item of
/// the last thread is compared against a \p tile_successor_item.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] tile_predecessor_item - first tile item from thread to be compared
/// against.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] tile_successor_item - last tile item from thread to be compared
/// against.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_predecessor_item = 0;
/// int tile_successor_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_predecessor_item = ...
/// tile_successor_item = ...
/// }
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tile_predecessor_item,
/// tail_flags, tile_successor_item,
/// input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template <unsigned int ItemsPerThread, class Flag, class FlagOp>
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          T tile_predecessor_item,
                          Flag (&tail_flags)[ItemsPerThread],
                          T tile_successor_item,
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op,
                          storage_type& storage)
{
    // Compile-time switches forwarded as template arguments to
    // base_type::apply_left / base_type::apply_right.
    static constexpr auto as_flags         = true;
    static constexpr auto reversed         = true;
    static constexpr auto with_predecessor = true;
    static constexpr auto with_successor   = true;

    // Copy items in case head_flags is aliased with input
    T items[ItemsPerThread];
    ROCPRIM_UNROLL
    for(unsigned int i = 0; i < ItemsPerThread; ++i)
    {
        items[i] = input[i];
    }

    // Head pass: uses the "left" member of the storage and the caller-provided
    // tile predecessor.
    base_type::template apply_left<as_flags, reversed, with_predecessor>(
        items, head_flags, flag_op, tile_predecessor_item, storage.get().left);

    // Tail pass: uses the separate "right" member of the storage and the
    // caller-provided tile successor.
    base_type::template apply_right<as_flags, reversed, with_successor>(
        items, tail_flags, flag_op, tile_successor_item, storage.get().right);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template <unsigned int ItemsPerThread, class Flag, class FlagOp>
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          T tile_predecessor_item,
                          Flag (&tail_flags)[ItemsPerThread],
                          T tile_successor_item,
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op)
{
    // Temporary storage is declared here with ROCPRIM_SHARED_MEMORY and handed
    // to the overload that takes the storage reference explicitly.
    ROCPRIM_SHARED_MEMORY storage_type storage;
    flag_heads_and_tails(head_flags,
                         tile_predecessor_item,
                         tail_flags,
                         tile_successor_item,
                         input,
                         flag_op,
                         storage);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the left item.
///
/// The first item in the first thread is copied from the input then for the rest the following
/// code applies.
/// \code
/// // For each i in [1, block_size * ItemsPerThread) across threads in a block
/// output[i] = op(input[i], input[i-1]);
/// \endcode
///
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
/// `R f(const T &a, const T &b)`, where `R` is any type assignable to `Output`.
/// The signature does not need to have `const &`, but the function object must
/// not modify the objects passed to it.
/// \param storage reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE
void subtract_left(const T (&input)[ItemsPerThread],
                   Output (&output)[ItemsPerThread],
                   const BinaryFunction op,
                   storage_type& storage)
{
    // Template switches for base_type::apply_left, given in order:
    // as_flags = false, reversed = true, with_predecessor = false.
    // input[0] only fills the predecessor argument of the call.
    base_type::template apply_left</* as_flags */ false,
                                   /* reversed */ true,
                                   /* with_predecessor */ false>(
        input, output, op, input[0] /* predecessor */, storage.get().left);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the left item, with an explicit item before
/// the tile.
///
/// \code
/// // For the first item on the first thread use the tile predecessor
/// output[0] = op(input[0], tile_predecessor)
/// // For other items, i in [1, block_size * ItemsPerThread) across threads in a block
/// output[i] = op(input[i], input[i-1]);
/// \endcode
///
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
/// `bool f(const T &a, const T &b)` The signature does not need to have
/// `const &` but the function object must not modify the objects passed to it.
/// \param [in] tile_predecessor - the item before the tile, will be used as the input
/// of the first application of `op`
/// \param storage - reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_left(const T (&input)[ItemsPerThread],
                                                 Output (&output)[ItemsPerThread],
                                                 const BinaryFunction op,
                                                 const T              tile_predecessor,
                                                 storage_type&        storage)
{
    // Compile-time switches forwarded to base_type::apply_left, in template-argument order:
    // as_flags, reversed, with_predecessor.
    constexpr bool kAsFlags         = false;
    constexpr bool kReversed        = true;
    constexpr bool kWithPredecessor = true;

    // Same forwarding as the overload without a predecessor, except that the caller-provided
    // tile_predecessor is passed through and with_predecessor is enabled.
    base_type::template apply_left<kAsFlags, kReversed, kWithPredecessor>(
        input,
        output,
        op,
        tile_predecessor,
        storage.get().left);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the left item, in a partial tile.
///
/// \code
/// output[0] = input[0]
/// // For each item i in [1, valid_items) across threads in a block
/// output[i] = op(input[i], input[i-1]);
/// // Just copy "invalid" items in [valid_items, block_size * ItemsPerThread)
/// output[i] = input[i]
/// \endcode
///
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
/// `bool f(const T &a, const T &b)` The signature does not need to have
/// `const &` but the function object must not modify the objects passed to it.
/// \param [in] valid_items - number of items in the block which are considered "valid" and will
/// be used. Must be less or equal to `BlockSize` * `ItemsPerThread`
/// \param storage - reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_left_partial(const T (&input)[ItemsPerThread],
                                                         Output (&output)[ItemsPerThread],
                                                         const BinaryFunction op,
                                                         const unsigned int   valid_items,
                                                         storage_type&        storage)
{
    // Compile-time switches forwarded to base_type::apply_left_partial, in
    // template-argument order: as_flags, reversed, with_predecessor.
    constexpr bool kAsFlags         = false;
    constexpr bool kReversed        = true;
    constexpr bool kWithPredecessor = false;

    // No tile predecessor is supplied by the caller; input[0] only fills the predecessor
    // parameter slot (presumably ignored when with_predecessor is false -- confirm in
    // block_adjacent_difference_impl). valid_items is forwarded unchanged.
    base_type::template apply_left_partial<kAsFlags, kReversed, kWithPredecessor>(
        input,
        output,
        op,
        input[0] /* predecessor */,
        valid_items,
        storage.get().left);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the left item, in a partial tile with a
/// predecessor.
///
/// This combines subtract_left_partial() with a tile predecessor.
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
/// `bool f(const T &a, const T &b)` The signature does not need to have
/// `const &` but the function object must not modify the objects passed to it.
/// \param [in] tile_predecessor - the item before the tile, will be used as the input
/// of the first application of `op`
/// \param [in] valid_items - number of items in the block which are considered "valid" and will
/// be used. Must be less or equal to `BlockSize` * `ItemsPerThread`
/// \param storage - reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_left_partial(const T (&input)[ItemsPerThread],
                                                         Output (&output)[ItemsPerThread],
                                                         const BinaryFunction op,
                                                         const T              tile_predecessor,
                                                         const unsigned int   valid_items,
                                                         storage_type&        storage)
{
    // Compile-time switches forwarded to base_type::apply_left_partial, in
    // template-argument order: as_flags, reversed, with_predecessor.
    constexpr bool kAsFlags         = false;
    constexpr bool kReversed        = true;
    constexpr bool kWithPredecessor = true;

    // Partial-tile variant with an explicit predecessor: both tile_predecessor and
    // valid_items are handed to the base implementation unchanged.
    base_type::template apply_left_partial<kAsFlags, kReversed, kWithPredecessor>(
        input,
        output,
        op,
        tile_predecessor,
        valid_items,
        storage.get().left);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the right item.
///
/// The last item in the last thread is copied from the input then for the rest the following
/// code applies.
/// \code
/// // For each i in [0, block_size * ItemsPerThread - 1) across threads in a block
/// output[i] = op(input[i], input[i+1]);
/// \endcode
///
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
/// `bool f(const T &a, const T &b)` The signature does not need to have
/// `const &` but the function object must not modify the objects passed to it.
/// \param storage - reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_right(const T (&input)[ItemsPerThread],
                                                  Output (&output)[ItemsPerThread],
                                                  const BinaryFunction op,
                                                  storage_type&        storage)
{
    // Compile-time switches forwarded to base_type::apply_right, in template-argument order:
    // as_flags, reversed, with_successor.
    constexpr bool kAsFlags       = false;
    constexpr bool kReversed      = false;
    constexpr bool kWithSuccessor = false;

    // The caller supplies no tile successor in this overload, so input[0] only fills the
    // successor parameter slot (presumably ignored when with_successor is false -- confirm
    // in block_adjacent_difference_impl). The `right` part of the storage is used.
    base_type::template apply_right<kAsFlags, kReversed, kWithSuccessor>(
        input,
        output,
        op,
        input[0] /* successor */,
        storage.get().right);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the right item, with an explicit item after
/// the tile.
///
/// \code
/// // For each item i in [0, block_size * ItemsPerThread - 1) across threads in a block
/// output[i] = op(input[i], input[i+1]);
/// // For the last item on the last thread use the tile successor
/// output[block_size * ItemsPerThread - 1] =
/// op(input[block_size * ItemsPerThread - 1], tile_successor)
/// \endcode
///
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
/// `bool f(const T &a, const T &b)` The signature does not need to have
/// `const &` but the function object must not modify the objects passed to it.
/// \param [in] tile_successor - the item after the tile, will be used as the input
/// of the last application of `op`
/// \param storage - reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_right(const T (&input)[ItemsPerThread],
                                                  Output (&output)[ItemsPerThread],
                                                  const BinaryFunction op,
                                                  const T              tile_successor,
                                                  storage_type&        storage)
{
    // Compile-time switches forwarded to base_type::apply_right, in template-argument order:
    // as_flags, reversed, with_successor.
    constexpr bool kAsFlags       = false;
    constexpr bool kReversed      = false;
    constexpr bool kWithSuccessor = true;

    // Same forwarding as the overload without a successor, except that the caller-provided
    // tile_successor is passed through and with_successor is enabled.
    base_type::template apply_right<kAsFlags, kReversed, kWithSuccessor>(
        input,
        output,
        op,
        tile_successor,
        storage.get().right);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the right item, in a partial tile.
///
/// \code
/// // For each item i in [0, valid_items) across threads in a block
/// output[i] = op(input[i], input[i + 1]);
/// // Just copy "invalid" items in [valid_items, block_size * ItemsPerThread)
/// output[i] = input[i]
/// \endcode
///
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
/// `bool f(const T &a, const T &b)` The signature does not need to have
/// `const &` but the function object must not modify the objects passed to it.
/// \param [in] valid_items - number of items in the block which are considered "valid" and will
/// be used. Must be less or equal to `BlockSize` * `ItemsPerThread`
/// \param storage - reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_right_partial(const T (&input)[ItemsPerThread],
                                                          Output (&output)[ItemsPerThread],
                                                          const BinaryFunction op,
                                                          const unsigned int   valid_items,
                                                          storage_type&        storage)
{
    // Compile-time switches forwarded to base_type::apply_right_partial, in
    // template-argument order: as_flags, reversed. This entry point takes no successor
    // argument and no with_successor switch.
    constexpr bool kAsFlags  = false;
    constexpr bool kReversed = false;

    // valid_items is forwarded unchanged; the `right` part of the storage is used.
    base_type::template apply_right_partial<kAsFlags, kReversed>(
        input,
        output,
        op,
        valid_items,
        storage.get().right);
}
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_ADJACENT_DIFFERENCE_HPP_
3rdparty/cub/rocprim/block/block_discontinuity.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_DISCONTINUITY_HPP_
#define ROCPRIM_BLOCK_BLOCK_DISCONTINUITY_HPP_
#include "detail/block_adjacent_difference_impl.hpp"
#include "../config.hpp"
#include "../detail/various.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief The \p block_discontinuity class is a block level parallel primitive which provides
/// methods for flagging items that are discontinued within an ordered set of items across
/// threads in a block.
///
/// \tparam T - the input type.
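/// \tparam BlockSizeX - the number of threads in a block's x dimension.
/// \tparam BlockSizeY - the number of threads in a block's y dimension, defaults to 1.
/// \tparam BlockSizeZ - the number of threads in a block's z dimension, defaults to 1.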
///
/// \par Overview
/// * There are two types of flags:
/// * Head flags.
/// * Tail flags.
/// * The above flags are used to differentiate items from their predecessors or successors.
/// * E.g. Head flags are convenient for differentiating disjoint data segments as part of a
/// segmented reduction/scan.
///
/// \par Examples
/// \parblock
/// In the examples discontinuity operation is performed on block of 128 threads, using type
/// \p int.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int head_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads(head_flags, input, flag_op_type(), storage);
/// ...
/// }
/// \endcode
/// \endparblock
template
<
class
T
,
unsigned
int
BlockSizeX
,
unsigned
int
BlockSizeY
=
1
,
unsigned
int
BlockSizeZ
=
1
>
class
block_discontinuity
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hide implementation detail from documentation
:
private
detail
::
block_adjacent_difference_impl
<
T
,
BlockSizeX
,
BlockSizeY
,
BlockSizeZ
>
#endif // DOXYGEN_SHOULD_SKIP_THIS
{
using
base_type
=
detail
::
block_adjacent_difference_impl
<
T
,
BlockSizeX
,
BlockSizeY
,
BlockSizeZ
>
;
static
constexpr
unsigned
BlockSize
=
base_type
::
BlockSize
;
// Struct used for creating a raw_storage object for this primitive's temporary storage.
struct storage_type_
{
    // Handed to the base_type::apply_left calls (head flags).
    typename base_type::storage_type left;
    // Handed to the base_type::apply_right calls (tail flags).
    typename base_type::storage_type right;
};
public:
/// \brief Struct used to allocate a temporary memory that is required for thread
/// communication during operations provided by related parallel primitive.
///
/// Depending on the implementation the operations exposed by parallel primitive may
/// require a temporary storage for thread communication. The storage should be allocated
/// using keywords <tt>__shared__</tt>. It can be aliased to
/// an externally allocated memory, or be a part of a union type with other storage types
/// to increase shared memory reusability.
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
using
storage_type
=
detail
::
raw_storage
<
storage_type_
>
;
#else
using
storage_type
=
storage_type_
;
#endif
/// \brief Tags \p head_flags that indicate discontinuities between items partitioned
/// across the thread block, where the first item has no reference and is always
/// flagged.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int head_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads(head_flags, input, flag_op_type(), storage);
/// ...
/// }
/// \endcode
template <unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE void flag_heads(Flag (&head_flags)[ItemsPerThread],
                                              const T (&input)[ItemsPerThread],
                                              FlagOp        flag_op,
                                              storage_type& storage)
{
    // Compile-time switches forwarded to base_type::apply_left, in template-argument order:
    // as_flags, reversed, with_predecessor.
    constexpr bool kAsFlags         = true;
    constexpr bool kReversed        = false;
    constexpr bool kWithPredecessor = false;

    // No tile predecessor is supplied by the caller; input[0] only fills the predecessor
    // parameter slot (presumably ignored when with_predecessor is false -- confirm in
    // block_adjacent_difference_impl). Results are written to head_flags.
    base_type::template apply_left<kAsFlags, kReversed, kWithPredecessor>(
        input,
        head_flags,
        flag_op,
        input[0] /* predecessor */,
        storage.get().left);
}
/// \overload
/// This overload does not take a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template <unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void flag_heads(Flag (&head_flags)[ItemsPerThread],
                                                    const T (&input)[ItemsPerThread],
                                                    FlagOp flag_op)
{
    // Declare the temporary storage locally (via ROCPRIM_SHARED_MEMORY) and delegate to the
    // overload that takes the storage explicitly.
    ROCPRIM_SHARED_MEMORY storage_type block_storage;
    flag_heads(head_flags, input, flag_op, block_storage);
}
/// \brief Tags \p head_flags that indicate discontinuities between items partitioned
/// across the thread block, where the first item of the first thread is compared against
/// a \p tile_predecessor_item.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] tile_predecessor_item - first tile item from thread to be compared
/// against.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int head_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads(head_flags, tile_item, input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template <unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE void flag_heads(Flag (&head_flags)[ItemsPerThread],
                                              T tile_predecessor_item,
                                              const T (&input)[ItemsPerThread],
                                              FlagOp        flag_op,
                                              storage_type& storage)
{
    // Compile-time switches forwarded to base_type::apply_left, in template-argument order:
    // as_flags, reversed, with_predecessor.
    constexpr bool kAsFlags         = true;
    constexpr bool kReversed        = false;
    constexpr bool kWithPredecessor = true;

    // The caller-provided tile_predecessor_item is passed through as the predecessor and
    // with_predecessor is enabled. Results are written to head_flags.
    base_type::template apply_left<kAsFlags, kReversed, kWithPredecessor>(
        input,
        head_flags,
        flag_op,
        tile_predecessor_item,
        storage.get().left);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template <unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void flag_heads(Flag (&head_flags)[ItemsPerThread],
                                                    T tile_predecessor_item,
                                                    const T (&input)[ItemsPerThread],
                                                    FlagOp flag_op)
{
    // Declare the temporary storage locally (via ROCPRIM_SHARED_MEMORY) and delegate to the
    // predecessor overload that takes the storage explicitly.
    ROCPRIM_SHARED_MEMORY storage_type block_storage;
    flag_heads(head_flags, tile_predecessor_item, input, flag_op, block_storage);
}
/// \brief Tags \p tail_flags that indicate discontinuities between items partitioned
/// across the thread block, where the last item has no reference and is always
/// flagged.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int tail_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_tails(tail_flags, input, flag_op_type(), storage);
/// ...
/// }
/// \endcode
template <unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE void flag_tails(Flag (&tail_flags)[ItemsPerThread],
                                              const T (&input)[ItemsPerThread],
                                              FlagOp        flag_op,
                                              storage_type& storage)
{
    // Compile-time switches forwarded to base_type::apply_right, in template-argument order:
    // as_flags, reversed, with_successor.
    constexpr bool kAsFlags       = true;
    constexpr bool kReversed      = false;
    constexpr bool kWithSuccessor = false;

    // No tile successor is supplied by the caller; input[0] only fills the successor
    // parameter slot (presumably ignored when with_successor is false -- confirm in
    // block_adjacent_difference_impl). Results are written to tail_flags.
    base_type::template apply_right<kAsFlags, kReversed, kWithSuccessor>(
        input,
        tail_flags,
        flag_op,
        input[0] /* successor */,
        storage.get().right);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template <unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void flag_tails(Flag (&tail_flags)[ItemsPerThread],
                                                    const T (&input)[ItemsPerThread],
                                                    FlagOp flag_op)
{
    // Declare the temporary storage locally (via ROCPRIM_SHARED_MEMORY) and delegate to the
    // overload that takes the storage explicitly.
    ROCPRIM_SHARED_MEMORY storage_type block_storage;
    flag_tails(tail_flags, input, flag_op, block_storage);
}
/// \brief Tags \p tail_flags that indicate discontinuities between items partitioned
/// across the thread block, where the last item of the last thread is compared against
/// a \p tile_successor_item.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] tile_successor_item - last tile item from thread to be compared
/// against.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int tail_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_tails(tail_flags, tile_item, input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template <unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE void flag_tails(Flag (&tail_flags)[ItemsPerThread],
                                              T tile_successor_item,
                                              const T (&input)[ItemsPerThread],
                                              FlagOp        flag_op,
                                              storage_type& storage)
{
    // Compile-time switches forwarded to base_type::apply_right, in template-argument order:
    // as_flags, reversed, with_successor.
    constexpr bool kAsFlags       = true;
    constexpr bool kReversed      = false;
    constexpr bool kWithSuccessor = true;

    // The caller-provided tile_successor_item is passed through as the successor and
    // with_successor is enabled. Results are written to tail_flags.
    base_type::template apply_right<kAsFlags, kReversed, kWithSuccessor>(
        input,
        tail_flags,
        flag_op,
        tile_successor_item,
        storage.get().right);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template <unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void flag_tails(Flag (&tail_flags)[ItemsPerThread],
                                                    T tile_successor_item,
                                                    const T (&input)[ItemsPerThread],
                                                    FlagOp flag_op)
{
    // Declare the temporary storage locally (via ROCPRIM_SHARED_MEMORY) and delegate to the
    // successor overload that takes the storage explicitly.
    ROCPRIM_SHARED_MEMORY storage_type block_storage;
    flag_tails(tail_flags, tile_successor_item, input, flag_op, block_storage);
}
/// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tail_flags, input,
/// flag_op_type(), storage);
/// ...
/// }
/// \endcode
template <unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                                                        Flag (&tail_flags)[ItemsPerThread],
                                                        const T (&input)[ItemsPerThread],
                                                        FlagOp        flag_op,
                                                        storage_type& storage)
{
    // Compile-time switches shared by the apply_left / apply_right calls below.
    // Neither a tile predecessor nor a tile successor is supplied in this overload.
    constexpr bool kAsFlags         = true;
    constexpr bool kReversed        = false;
    constexpr bool kWithPredecessor = false;
    constexpr bool kWithSuccessor   = false;

    // Take a thread-local copy of the input first, in case head_flags is aliased with input:
    // both passes below then read from the copy rather than from input.
    T local_items[ItemsPerThread];
    ROCPRIM_UNROLL
    for(unsigned int idx = 0; idx < ItemsPerThread; ++idx)
    {
        local_items[idx] = input[idx];
    }

    // Head flags: uses the `left` part of the storage. local_items[0] only fills the
    // predecessor parameter slot (presumably ignored when with_predecessor is false --
    // confirm in block_adjacent_difference_impl).
    base_type::template apply_left<kAsFlags, kReversed, kWithPredecessor>(
        local_items,
        head_flags,
        flag_op,
        local_items[0] /*predecessor*/,
        storage.get().left);

    // Tail flags: uses the separate `right` part of the storage. local_items[0] only fills
    // the successor parameter slot (presumably ignored when with_successor is false).
    base_type::template apply_right<kAsFlags, kReversed, kWithSuccessor>(
        local_items,
        tail_flags,
        flag_op,
        local_items[0] /*successor*/,
        storage.get().right);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          Flag (&tail_flags)[ItemsPerThread],
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op)
{
    // Temporary storage is declared locally and handed to the
    // storage-taking overload, which does the actual work.
    ROCPRIM_SHARED_MEMORY storage_type scratch;
    flag_heads_and_tails(head_flags, tail_flags, input, flag_op, scratch);
}
/// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block, where the last item of the
/// last thread is compared against a \p tile_successor_item.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] tile_successor_item - last tile item from thread to be compared
/// against.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tail_flags, tile_item,
/// input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          Flag (&tail_flags)[ItemsPerThread],
                          T tile_successor_item,
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op,
                          storage_type& storage)
{
    // Compile-time switches forwarded to base_type::apply_left / apply_right.
    // Only a tile successor is supplied by the caller in this overload.
    static constexpr bool emit_flags      = true;
    static constexpr bool reverse_order   = false;
    static constexpr bool has_predecessor = false;
    static constexpr bool has_successor   = true;

    // Operate on a private copy so that head_flags may alias input.
    T snapshot[ItemsPerThread];
    ROCPRIM_UNROLL
    for(unsigned int k = 0; k < ItemsPerThread; ++k)
    {
        snapshot[k] = input[k];
    }

    // snapshot[0] merely fills the predecessor argument slot; presumably it
    // is ignored because has_predecessor is false.
    base_type::template apply_left<emit_flags, reverse_order, has_predecessor>(
        snapshot,
        head_flags,
        flag_op,
        snapshot[0] /*predecessor*/,
        storage.get().left);

    // Tail flags are computed against the caller-provided successor item.
    base_type::template apply_right<emit_flags, reverse_order, has_successor>(
        snapshot,
        tail_flags,
        flag_op,
        tile_successor_item,
        storage.get().right);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          Flag (&tail_flags)[ItemsPerThread],
                          T tile_successor_item,
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op)
{
    // Temporary storage is declared locally and handed to the
    // storage-taking overload, which does the actual work.
    ROCPRIM_SHARED_MEMORY storage_type scratch;
    flag_heads_and_tails(head_flags,
                         tail_flags,
                         tile_successor_item,
                         input,
                         flag_op,
                         scratch);
}
/// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block, where the first item of the
/// first thread is compared against a \p tile_predecessor_item.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] tile_predecessor_item - first tile item from thread to be compared
/// against.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tile_item, tail_flags,
/// input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          T tile_predecessor_item,
                          Flag (&tail_flags)[ItemsPerThread],
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op,
                          storage_type& storage)
{
    // Compile-time switches forwarded to base_type::apply_left / apply_right.
    // Only a tile predecessor is supplied by the caller in this overload.
    static constexpr bool emit_flags      = true;
    static constexpr bool reverse_order   = false;
    static constexpr bool has_predecessor = true;
    static constexpr bool has_successor   = false;

    // Operate on a private copy so that head_flags may alias input.
    T snapshot[ItemsPerThread];
    ROCPRIM_UNROLL
    for(unsigned int k = 0; k < ItemsPerThread; ++k)
    {
        snapshot[k] = input[k];
    }

    // Head flags are computed against the caller-provided predecessor item.
    base_type::template apply_left<emit_flags, reverse_order, has_predecessor>(
        snapshot,
        head_flags,
        flag_op,
        tile_predecessor_item,
        storage.get().left);

    // snapshot[0] merely fills the successor argument slot; presumably it
    // is ignored because has_successor is false.
    base_type::template apply_right<emit_flags, reverse_order, has_successor>(
        snapshot,
        tail_flags,
        flag_op,
        snapshot[0] /*successor*/,
        storage.get().right);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          T tile_predecessor_item,
                          Flag (&tail_flags)[ItemsPerThread],
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op)
{
    // Temporary storage is declared locally and handed to the
    // storage-taking overload, which does the actual work.
    ROCPRIM_SHARED_MEMORY storage_type scratch;
    flag_heads_and_tails(head_flags,
                         tile_predecessor_item,
                         tail_flags,
                         input,
                         flag_op,
                         scratch);
}
/// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block, where the first item of the
/// first thread and the last item of the last thread are compared against
/// a \p tile_predecessor_item and a \p tile_successor_item, respectively.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] tile_predecessor_item - first tile item from thread to be compared
/// against.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] tile_successor_item - last tile item from thread to be compared
/// against.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_predecessor_item = 0;
/// int tile_successor_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_predecessor_item = ...
/// tile_successor_item = ...
/// }
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tile_predecessor_item,
/// tail_flags, tile_successor_item,
/// input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          T tile_predecessor_item,
                          Flag (&tail_flags)[ItemsPerThread],
                          T tile_successor_item,
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op,
                          storage_type& storage)
{
    // Compile-time switches forwarded to base_type::apply_left / apply_right.
    // Both a tile predecessor and a tile successor are supplied by the caller.
    static constexpr bool emit_flags      = true;
    static constexpr bool reverse_order   = false;
    static constexpr bool has_predecessor = true;
    static constexpr bool has_successor   = true;

    // Operate on a private copy so that head_flags may alias input.
    T snapshot[ItemsPerThread];
    ROCPRIM_UNROLL
    for(unsigned int k = 0; k < ItemsPerThread; ++k)
    {
        snapshot[k] = input[k];
    }

    // Head flags are computed against the caller-provided predecessor item.
    base_type::template apply_left<emit_flags, reverse_order, has_predecessor>(
        snapshot,
        head_flags,
        flag_op,
        tile_predecessor_item,
        storage.get().left);

    // Tail flags are computed against the caller-provided successor item.
    base_type::template apply_right<emit_flags, reverse_order, has_successor>(
        snapshot,
        tail_flags,
        flag_op,
        tile_successor_item,
        storage.get().right);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          T tile_predecessor_item,
                          Flag (&tail_flags)[ItemsPerThread],
                          T tile_successor_item,
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op)
{
    // Temporary storage is declared locally and handed to the
    // storage-taking overload, which does the actual work.
    ROCPRIM_SHARED_MEMORY storage_type scratch;
    flag_heads_and_tails(head_flags,
                         tile_predecessor_item,
                         tail_flags,
                         tile_successor_item,
                         input,
                         flag_op,
                         scratch);
}
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_DISCONTINUITY_HPP_
3rdparty/cub/rocprim/block/block_exchange.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_
#define ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "../types.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief The \p block_exchange class is a block level parallel primitive which provides
/// methods for rearranging items partitioned across threads in a block.
///
/// \tparam T - the input type.
/// \tparam BlockSize - the number of threads in a block.
/// \tparam ItemsPerThread - the number of items contributed by each thread.
///
/// \par Overview
/// * The \p block_exchange class supports the following rearrangement methods:
/// * Transposing a blocked arrangement to a striped arrangement.
/// * Transposing a striped arrangement to a blocked arrangement.
/// * Transposing a blocked arrangement to a warp-striped arrangement.
/// * Transposing a warp-striped arrangement to a blocked arrangement.
/// * Scattering items to a blocked arrangement.
/// * Scattering items to a striped arrangement.
/// * Data is automatically padded to minimize bank conflicts.
///
/// \par Examples
/// \parblock
/// In the examples exchange operation is performed on block of 128 threads, using type
/// \p int with 8 items per thread.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.blocked_to_striped(items, items, storage);
/// ...
/// }
/// \endcode
/// \endparblock
template
<
class
T
,
unsigned
int
BlockSizeX
,
unsigned
int
ItemsPerThread
,
unsigned
int
BlockSizeY
=
1
,
unsigned
int
BlockSizeZ
=
1
>
class
block_exchange
{
// Total number of threads in the block: product of the three block dimensions.
static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;

// Warp size used by this primitive, as returned by detail::get_min_warp_size
// for the block size and the device warp size.
static constexpr unsigned int warp_size
    = detail::get_min_warp_size(BlockSize, ::rocprim::device_warp_size());

// Number of warps needed to cover the block (ceiling division).
static constexpr unsigned int warps_no = (BlockSize + warp_size - 1) / warp_size;

// Number of LDS banks as reported by detail::get_lds_banks_no().
static constexpr unsigned int banks_no = ::rocprim::detail::get_lds_banks_no();

// Minimize LDS bank conflicts for power-of-two strides, i.e. when items are
// accessed using the `thread_id * ItemsPerThread` pattern where ItemsPerThread
// is a power of two (all exchanges from/to blocked).
static constexpr bool has_bank_conflicts
    = ItemsPerThread >= 2 && ::rocprim::detail::is_power_of_two(ItemsPerThread);

// Extra buffer elements reserved for padding; zero when no padding is needed.
static constexpr unsigned int bank_conflicts_padding
    = has_bank_conflicts ? (BlockSize * ItemsPerThread / banks_no) : 0;
// Struct used for creating a raw_storage object for this primitive's temporary storage.
struct storage_type_
{
    // One slot per item in the block plus the bank-conflict padding elements.
    T buffer[BlockSize * ItemsPerThread + bank_conflicts_padding];
};
public:
/// \brief Struct used to allocate a temporary memory that is required for thread
/// communication during operations provided by related parallel primitive.
///
/// Depending on the implementation the operations exposed by parallel primitive may
/// require a temporary storage for thread communication. The storage should be allocated
/// using keywords <tt>__shared__</tt>. It can be aliased to
/// an externally allocated memory, or be a part of a union type with other storage types
/// to increase shared memory reusability.
#ifdef DOXYGEN_SHOULD_SKIP_THIS // Doxygen sees the plain struct only
using storage_type = storage_type_;
#else // regular builds wrap the struct in detail::raw_storage
using storage_type = detail::raw_storage<storage_type_>;
#endif
/// \brief Transposes a blocked arrangement of items to a striped arrangement
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
template<class U>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void blocked_to_striped(const T (&input)[ItemsPerThread],
                        U (&output)[ItemsPerThread])
{
    // Declare the temporary storage here and defer to the storage-taking overload.
    ROCPRIM_SHARED_MEMORY storage_type scratch;
    blocked_to_striped(input, output, scratch);
}
/// \brief Transposes a blocked arrangement of items to a striped arrangement
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.blocked_to_striped(items, items, storage);
/// ...
/// }
/// \endcode
template<class U>
ROCPRIM_DEVICE ROCPRIM_INLINE
void blocked_to_striped(const T (&input)[ItemsPerThread],
                        U (&output)[ItemsPerThread],
                        storage_type& storage)
{
    const unsigned int tid
        = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    storage_type_& shared = storage.get();

    // Write phase: this thread's items occupy consecutive logical positions
    // starting at tid * ItemsPerThread. index() (defined elsewhere in this
    // class) translates the logical position into a buffer position.
    const unsigned int blocked_base = tid * ItemsPerThread;
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        shared.buffer[index(blocked_base + item)] = input[item];
    }

    // All writes must be complete before any thread reads.
    ::rocprim::syncthreads();

    // Read phase: item k is taken from logical position k * BlockSize + tid.
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        output[item] = shared.buffer[index(item * BlockSize + tid)];
    }
}
/// \brief Transposes a striped arrangement of items to a blocked arrangement
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
template<class U>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void striped_to_blocked(const T (&input)[ItemsPerThread],
                        U (&output)[ItemsPerThread])
{
    // Declare the temporary storage here and defer to the storage-taking overload.
    ROCPRIM_SHARED_MEMORY storage_type scratch;
    striped_to_blocked(input, output, scratch);
}
/// \brief Transposes a striped arrangement of items to a blocked arrangement
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.striped_to_blocked(items, items, storage);
/// ...
/// }
/// \endcode
template<class U>
ROCPRIM_DEVICE ROCPRIM_INLINE
void striped_to_blocked(const T (&input)[ItemsPerThread],
                        U (&output)[ItemsPerThread],
                        storage_type& storage)
{
    const unsigned int tid
        = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    storage_type_& shared = storage.get();

    // Write phase: item k goes to logical position k * BlockSize + tid.
    // index() (defined elsewhere in this class) translates the logical
    // position into a buffer position.
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        shared.buffer[index(item * BlockSize + tid)] = input[item];
    }

    // All writes must be complete before any thread reads.
    ::rocprim::syncthreads();

    // Read phase: this thread takes the consecutive logical positions
    // starting at tid * ItemsPerThread.
    const unsigned int blocked_base = tid * ItemsPerThread;
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        output[item] = shared.buffer[index(blocked_base + item)];
    }
}
/// \brief Transposes a blocked arrangement of items to a warp-striped arrangement
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
template<class U>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void blocked_to_warp_striped(const T (&input)[ItemsPerThread],
                             U (&output)[ItemsPerThread])
{
    // Declare the temporary storage here and defer to the storage-taking overload.
    ROCPRIM_SHARED_MEMORY storage_type scratch;
    blocked_to_warp_striped(input, output, scratch);
}
/// \brief Transposes a blocked arrangement of items to a warp-striped arrangement
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.blocked_to_warp_striped(items, items, storage);
/// ...
/// }
/// \endcode
template<class U>
ROCPRIM_DEVICE ROCPRIM_INLINE
void blocked_to_warp_striped(const T (&input)[ItemsPerThread],
                             U (&output)[ItemsPerThread],
                             storage_type& storage)
{
    // Number of logical buffer positions reserved for each warp.
    constexpr unsigned int warp_span = warp_size * ItemsPerThread;

    const unsigned int lane = ::rocprim::lane_id();
    const unsigned int warp = ::rocprim::warp_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    const unsigned int active_warp_size = get_current_warp_size();
    // First logical position of the slice used by this warp.
    const unsigned int warp_base = warp * warp_span;
    storage_type_& shared = storage.get();

    // Write phase: this lane's items occupy consecutive logical positions
    // inside the warp's slice. index() (defined elsewhere in this class)
    // translates the logical position into a buffer position.
    const unsigned int blocked_base = warp_base + lane * ItemsPerThread;
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        shared.buffer[index(blocked_base + item)] = input[item];
    }

    // Both loops address the slice that starts at this warp's own warp_base,
    // and the code issues ::rocprim::wave_barrier() rather than syncthreads().
    ::rocprim::wave_barrier();

    // Read phase: item k is taken from position k * active_warp_size + lane
    // inside the warp's slice.
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        output[item] = shared.buffer[index(warp_base + item * active_warp_size + lane)];
    }
}
/// \brief Transposes a warp-striped arrangement of items to a blocked arrangement
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
template<class U>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void warp_striped_to_blocked(const T (&input)[ItemsPerThread],
                             U (&output)[ItemsPerThread])
{
    // Declare the temporary storage here and defer to the storage-taking overload.
    ROCPRIM_SHARED_MEMORY storage_type scratch;
    warp_striped_to_blocked(input, output, scratch);
}
/// \brief Transposes a warp-striped arrangement of items to a blocked arrangement
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.warp_striped_to_blocked(items, items, storage);
/// ...
/// }
/// \endcode
template<class U>
ROCPRIM_DEVICE ROCPRIM_INLINE
void warp_striped_to_blocked(const T (&input)[ItemsPerThread],
                             U (&output)[ItemsPerThread],
                             storage_type& storage)
{
    // Number of logical buffer positions reserved for each warp.
    constexpr unsigned int warp_span = warp_size * ItemsPerThread;

    const unsigned int lane = ::rocprim::lane_id();
    const unsigned int warp = ::rocprim::warp_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    const unsigned int active_warp_size = get_current_warp_size();
    // First logical position of the slice used by this warp.
    const unsigned int warp_base = warp * warp_span;
    storage_type_& shared = storage.get();

    // Write phase: item k goes to position k * active_warp_size + lane inside
    // the warp's slice. index() (defined elsewhere in this class) translates
    // the logical position into a buffer position.
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        shared.buffer[index(warp_base + item * active_warp_size + lane)] = input[item];
    }

    // Both loops address the slice that starts at this warp's own warp_base,
    // and the code issues ::rocprim::wave_barrier() rather than syncthreads().
    ::rocprim::wave_barrier();

    // Read phase: this lane takes the consecutive logical positions starting
    // at lane * ItemsPerThread inside the warp's slice.
    const unsigned int blocked_base = warp_base + lane * ItemsPerThread;
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        output[item] = shared.buffer[index(blocked_base + item)];
    }
}
/// \brief Scatters items to a blocked arrangement based on their ranks
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void scatter_to_blocked(const T (&input)[ItemsPerThread],
                        U (&output)[ItemsPerThread],
                        const Offset (&ranks)[ItemsPerThread])
{
    // Declare the temporary storage here and defer to the storage-taking overload.
    ROCPRIM_SHARED_MEMORY storage_type scratch;
    scatter_to_blocked(input, output, ranks, scratch);
}
/// \brief Stores items to temporary storage in a striped arrangement and reads
/// them back from the positions given by \p ranks.
///
/// This overload does not accept a reference to temporary storage, instead it is
/// declared as part of the function itself.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array of positions each output item is read from.
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void gather_from_striped(const T (&input)[ItemsPerThread],
                         U (&output)[ItemsPerThread],
                         const Offset (&ranks)[ItemsPerThread])
{
    // Declare the temporary storage here and defer to the storage-taking overload.
    ROCPRIM_SHARED_MEMORY storage_type scratch;
    gather_from_striped(input, output, ranks, scratch);
}
/// \brief Scatters items to a blocked arrangement based on their ranks
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// int ranks[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.scatter_to_blocked(items, items, ranks, storage);
/// ...
/// }
/// \endcode
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_INLINE
void scatter_to_blocked(const T (&input)[ItemsPerThread],
                        U (&output)[ItemsPerThread],
                        const Offset (&ranks)[ItemsPerThread],
                        storage_type& storage)
{
    const unsigned int tid
        = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    storage_type_& shared = storage.get();

    // Write phase: every item is placed at the logical position given by its
    // rank. index() (defined elsewhere in this class) translates the logical
    // position into a buffer position.
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        shared.buffer[index(ranks[item])] = input[item];
    }

    // All writes must be complete before any thread reads.
    ::rocprim::syncthreads();

    // Read phase: this thread takes the consecutive logical positions
    // starting at tid * ItemsPerThread.
    const unsigned int blocked_base = tid * ItemsPerThread;
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        output[item] = shared.buffer[index(blocked_base + item)];
    }
}
/// \brief Stores items to temporary storage in a striped arrangement and reads
/// them back from the positions given by \p ranks, using temporary storage.
///
/// Each thread writes <tt>input[i]</tt> to logical position
/// <tt>i * BlockSize + flat_thread_id</tt> and, after a
/// \p rocprim::syncthreads() barrier, reads <tt>output[i]</tt> from logical
/// position <tt>ranks[i]</tt>.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array of positions each output item is read from.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_INLINE
void gather_from_striped(const T (&input)[ItemsPerThread],
                         U (&output)[ItemsPerThread],
                         const Offset (&ranks)[ItemsPerThread],
                         storage_type& storage)
{
    const unsigned int tid
        = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    storage_type_& shared = storage.get();

    // Write phase: item k goes to logical position k * BlockSize + tid.
    // index() (defined elsewhere in this class) translates the logical
    // position into a buffer position.
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        shared.buffer[index(item * BlockSize + tid)] = input[item];
    }

    // All writes must be complete before any thread reads.
    ::rocprim::syncthreads();

    // Read phase: every output item is fetched from the position named by its rank.
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        output[item] = shared.buffer[index(ranks[item])];
    }
}
/// \brief Scatters items to a striped arrangement based on their ranks
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void scatter_to_striped(const T (&input)[ItemsPerThread],
                        U (&output)[ItemsPerThread],
                        const Offset (&ranks)[ItemsPerThread])
{
    // Declare the temporary storage here and defer to the storage-taking overload.
    ROCPRIM_SHARED_MEMORY storage_type scratch;
    scatter_to_striped(input, output, ranks, scratch);
}
/// \brief Scatters items to a striped arrangement based on their ranks
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// int ranks[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.scatter_to_striped(items, items, ranks, storage);
/// ...
/// }
/// \endcode
template
<
class
U
,
class
Offset
>
ROCPRIM_DEVICE
ROCPRIM_INLINE
void
scatter_to_striped
(
const
T
(
&
input
)[
ItemsPerThread
],
U
(
&
output
)[
ItemsPerThread
],
const
Offset
(
&
ranks
)[
ItemsPerThread
],
storage_type
&
storage
)
{
const
unsigned
int
flat_id
=
::
rocprim
::
flat_block_thread_id
<
BlockSizeX
,
BlockSizeY
,
BlockSizeZ
>
();
storage_type_
&
storage_
=
storage
.
get
();
for
(
unsigned
int
i
=
0
;
i
<
ItemsPerThread
;
i
++
)
{
const
Offset
rank
=
ranks
[
i
];
storage_
.
buffer
[
rank
]
=
input
[
i
];
}
::
rocprim
::
syncthreads
();
for
(
unsigned
int
i
=
0
;
i
<
ItemsPerThread
;
i
++
)
{
output
[
i
]
=
storage_
.
buffer
[
i
*
BlockSize
+
flat_id
];
}
}
/// \brief Scatters items to a striped arrangement across the thread block,
/// placing each item at the position given by its rank and skipping items
/// whose rank is negative.
///
/// \par Overview
/// * Items with rank -1 are not scattered.
/// * This overload does not accept a storage argument. The temporary storage
/// is declared by the method itself (with \p ROCPRIM_SHARED_MEMORY) and the call
/// is forwarded to the overload that takes \p storage_type.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void scatter_to_striped_guarded(const T (&input)[ItemsPerThread],
                                U (&output)[ItemsPerThread],
                                const Offset (&ranks)[ItemsPerThread])
{
    // Exchange buffer owned by this call; handed to the storage-taking overload.
    ROCPRIM_SHARED_MEMORY storage_type exchange_storage;
    scatter_to_striped_guarded(input, output, ranks, exchange_storage);
}
/// \brief Scatters items to a striped arrangement across the thread block,
/// placing each item at the position given by its rank and skipping items
/// whose rank is negative, using temporary storage.
///
/// \par Overview
/// * Items with rank -1 are not scattered.
/// * The guard is the comparison <tt>rank >= 0</tt>, so it can only exclude
/// items when \p Offset is a signed type.
/// * Buffer positions that no item was scattered to are still read back into
/// \p output; this method does not write to them beforehand.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
///     // specialize block_exchange for int, block of 128 threads and 8 items per thread
///     using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
///     // allocate storage in shared memory
///     __shared__ block_exchange_int::storage_type storage;
///
///     int items[8];
///     int ranks[8];
///     ...
///     block_exchange_int b_exchange;
///     b_exchange.scatter_to_striped_guarded(items, items, ranks, storage);
///     ...
/// }
/// \endcode
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_INLINE
void scatter_to_striped_guarded(const T (&input)[ItemsPerThread],
                                U (&output)[ItemsPerThread],
                                const Offset (&ranks)[ItemsPerThread],
                                storage_type& storage)
{
    storage_type_& exchange = storage.get();
    const unsigned int thread_id
        = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();

    // Phase 1: write only the items whose rank is non-negative.
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        const Offset destination = ranks[item];
        if(destination >= 0)
        {
            exchange.buffer[destination] = input[item];
        }
    }

    // Every write has to be finished before any thread reads the buffer back.
    ::rocprim::syncthreads();

    // Phase 2: read back with a stride of BlockSize, i.e. item k of this
    // thread comes from position k * BlockSize + thread_id.
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        output[item] = exchange.buffer[item * BlockSize + thread_id];
    }
}
/// \brief Scatters items to a striped arrangement across the thread block,
/// placing each item at the position given by its rank; a per-item flag
/// decides whether the item is scattered at all.
///
/// * This overload does not accept a storage argument. The temporary storage
/// is declared by the method itself (with \p ROCPRIM_SHARED_MEMORY) and the call
/// is forwarded to the overload that takes \p storage_type.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
/// \tparam ValidFlag - [inferred] the validity flag type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
/// \param [in] is_valid - array that has flags to denote validity.
template<class U, class Offset, class ValidFlag>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void scatter_to_striped_flagged(const T (&input)[ItemsPerThread],
                                U (&output)[ItemsPerThread],
                                const Offset (&ranks)[ItemsPerThread],
                                const ValidFlag (&is_valid)[ItemsPerThread])
{
    // Exchange buffer owned by this call; handed to the storage-taking overload.
    ROCPRIM_SHARED_MEMORY storage_type exchange_storage;
    scatter_to_striped_flagged(input, output, ranks, is_valid, exchange_storage);
}
/// \brief Scatters items to a striped arrangement across the thread block,
/// placing each item at the position given by its rank; a per-item flag
/// decides whether the item is scattered at all. Uses temporary storage.
///
/// \par Overview
/// * An item is written only when its \p is_valid entry converts to \p true.
/// * Buffer positions that no item was scattered to are still read back into
/// \p output; this method does not write to them beforehand.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
/// \tparam ValidFlag - [inferred] the validity flag type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
/// \param [in] is_valid - array that has flags to denote validity.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
///     // specialize block_exchange for int, block of 128 threads and 8 items per thread
///     using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
///     // allocate storage in shared memory
///     __shared__ block_exchange_int::storage_type storage;
///
///     int items[8];
///     int ranks[8];
///     int flags[8];
///     ...
///     block_exchange_int b_exchange;
///     b_exchange.scatter_to_striped_flagged(items, items, ranks, flags, storage);
///     ...
/// }
/// \endcode
template<class U, class Offset, class ValidFlag>
ROCPRIM_DEVICE ROCPRIM_INLINE
void scatter_to_striped_flagged(const T (&input)[ItemsPerThread],
                                U (&output)[ItemsPerThread],
                                const Offset (&ranks)[ItemsPerThread],
                                const ValidFlag (&is_valid)[ItemsPerThread],
                                storage_type& storage)
{
    storage_type_& exchange = storage.get();
    const unsigned int thread_id
        = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();

    // Phase 1: write only the items that are flagged as valid.
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        const Offset destination = ranks[item];
        if(is_valid[item])
        {
            exchange.buffer[destination] = input[item];
        }
    }

    // Every write has to be finished before any thread reads the buffer back.
    ::rocprim::syncthreads();

    // Phase 2: read back with a stride of BlockSize, i.e. item k of this
    // thread comes from position k * BlockSize + thread_id.
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        output[item] = exchange.buffer[item * BlockSize + thread_id];
    }
}
private:
// Returns the size used for the calling thread's warp: warp_size for every
// warp except the one whose id is warps_no - 1. For that warp the result is
// BlockSize % warp_size when that remainder is non-zero, otherwise warp_size.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int get_current_warp_size() const
{
    const unsigned int current_warp
        = ::rocprim::warp_id<BlockSizeX, BlockSizeY, BlockSizeZ>();

    // Every warp but the last one is taken to be full.
    if(current_warp != warps_no - 1)
    {
        return warp_size;
    }

    // The last warp is partial only if BlockSize is not a multiple of warp_size.
    const unsigned int remainder = BlockSize % warp_size;
    return remainder > 0 ? remainder : warp_size;
}
// Remaps a linear buffer position to minimize LDS bank conflicts.
// When has_bank_conflicts is false the position is returned unchanged.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int index(unsigned int n)
{
    if(!has_bank_conflicts)
    {
        return n;
    }

    // Shift every banks_no-wide "row" by one item: one extra slot is skipped
    // for each complete row that precedes position n.
    return n + n / banks_no;
}
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_
3rdparty/cub/rocprim/block/block_histogram.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_HISTOGRAM_HPP_
#define ROCPRIM_BLOCK_BLOCK_HISTOGRAM_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "detail/block_histogram_atomic.hpp"
#include "detail/block_histogram_sort.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup blockmodule
/// @{
/// \brief Available algorithms for block_histogram primitive.
enum class block_histogram_algorithm
{
    /// Bin counters are updated directly with atomic additions.
    /// \par Performance Notes:
    /// * Performance is dependent on hardware implementation of atomic addition.
    /// * Performance may decrease for non-uniform random input distributions
    /// where many concurrent updates may be made to the same bin counter.
    using_atomic,

    /// A two-phase operation is used:
    /// * Data is sorted using radix-sort.
    /// * "Runs" of same-valued keys are detected using discontinuity; run-lengths
    /// are bin counts.
    /// \par Performance Notes:
    /// * Performance is consistent regardless of sample bin distribution.
    using_sort,

    /// \brief Default block_histogram algorithm (alias of \p using_atomic).
    default_algorithm = using_atomic
};
namespace detail
{

// Maps a block_histogram_algorithm enumerator to the class template that
// implements it. The primary template is declared but never defined; every
// supported algorithm has an explicit specialization that exposes the
// implementation through the member alias template `type`.
template<block_histogram_algorithm Algorithm>
struct select_block_histogram_impl;

// block_histogram_algorithm::using_atomic selects block_histogram_atomic.
template<>
struct select_block_histogram_impl<block_histogram_algorithm::using_atomic>
{
    template<
        class T,
        unsigned int BlockSizeX,
        unsigned int BlockSizeY,
        unsigned int BlockSizeZ,
        unsigned int ItemsPerThread,
        unsigned int Bins
    >
    using type = block_histogram_atomic<T, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Bins>;
};

// block_histogram_algorithm::using_sort selects block_histogram_sort.
template<>
struct select_block_histogram_impl<block_histogram_algorithm::using_sort>
{
    template<
        class T,
        unsigned int BlockSizeX,
        unsigned int BlockSizeY,
        unsigned int BlockSizeZ,
        unsigned int ItemsPerThread,
        unsigned int Bins
    >
    using type = block_histogram_sort<T, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Bins>;
};

} // end namespace detail
/// \brief The block_histogram class is a block level parallel primitive which provides methods
/// for constructing block-wide histograms from items partitioned across threads in a block.
///
/// \tparam T - the input/output type.
/// \tparam BlockSizeX - the number of threads in a block's x dimension.
/// \tparam ItemsPerThread - the number of items to be processed by each thread.
/// \tparam Bins - the number of bins within the histogram.
/// \tparam Algorithm - selected histogram algorithm, block_histogram_algorithm::default_algorithm by default.
/// \tparam BlockSizeY - the number of threads in a block's y dimension, defaults to 1.
/// \tparam BlockSizeZ - the number of threads in a block's z dimension, defaults to 1.
///
/// \par Overview
/// * block_histogram has two alternative implementations: \p block_histogram_algorithm::using_atomic
/// and block_histogram_algorithm::using_sort.
/// * The total number of threads in the block is
/// <tt>BlockSizeX * BlockSizeY * BlockSizeZ</tt>.
///
/// \par Examples
/// \parblock
/// In the examples histogram operation is performed on block of 192 threads, each provides
/// two \p int values, result is written to the \p hist array.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
///     // specialize block_histogram for int, logical block of 192 threads,
///     // 2 items per thread and a bin size of 192.
///     using block_histogram_int = rocprim::block_histogram<int, 192, 2, 192>;
///     // allocate storage in shared memory
///     __shared__ block_histogram_int::storage_type storage;
///     __shared__ int hist[192];
///
///     int value[2];
///     ...
///     // execute histogram
///     block_histogram_int().histogram(
///         value, // input
///         hist, // output
///         storage
///     );
///     ...
/// }
/// \endcode
/// \endparblock
template<
    class T,
    unsigned int BlockSizeX,
    unsigned int ItemsPerThread,
    unsigned int Bins,
    block_histogram_algorithm Algorithm = block_histogram_algorithm::default_algorithm,
    unsigned int BlockSizeY = 1,
    unsigned int BlockSizeZ = 1
>
class block_histogram
#ifndef DOXYGEN_SHOULD_SKIP_THIS
    : private detail::select_block_histogram_impl<Algorithm>::template type<T, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Bins>
#endif
{
    // Implementation chosen by Algorithm; composite() calls are forwarded to it.
    using base_type = typename detail::select_block_histogram_impl<Algorithm>::template type<T, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Bins>;

    // Total number of threads in the block.
    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;

public:
    /// \brief Struct used to allocate a temporary memory that is required for thread
    /// communication during operations provided by related parallel primitive.
    ///
    /// Depending on the implementation the operations exposed by parallel primitive may
    /// require a temporary storage for thread communication. The storage should be allocated
    /// using keywords <tt>__shared__</tt>. It can be aliased to
    /// an externally allocated memory, or be a part of a union type with other storage types
    /// to increase shared memory reusability.
    using storage_type = typename base_type::storage_type;

    /// \brief Initialize histogram counters to zero.
    ///
    /// Every bin is assigned a value-initialized \p Counter. The bins are
    /// distributed over the threads of the block; no synchronization is
    /// performed by this method.
    ///
    /// \tparam Counter - [inferred] counter type of histogram.
    ///
    /// \param [out] hist - histogram bin count.
    template<class Counter>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void init_histogram(Counter hist[Bins])
    {
        const auto thread_id
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();

        // Walk the bins in chunks of BlockSize; each thread clears one bin per
        // chunk, and the bound check handles a partial trailing chunk.
        ROCPRIM_UNROLL
        for(unsigned int chunk_begin = 0; chunk_begin < Bins; chunk_begin += BlockSize)
        {
            const unsigned int bin = chunk_begin + thread_id;
            if(bin < Bins)
            {
                hist[bin] = Counter();
            }
        }
    }

    /// \brief Update an existing block-wide histogram. Each thread composites an array of
    /// input elements.
    ///
    /// \tparam Counter - [inferred] counter type of histogram.
    ///
    /// \param [in] input - reference to an array containing thread input values.
    /// \param [out] hist - histogram bin count.
    /// \param [in] storage - reference to a temporary storage object of type storage_type.
    ///
    /// \par Storage reusage
    /// Synchronization barrier should be placed before \p storage is reused
    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
    ///
    /// \par Examples
    /// \parblock
    /// In the examples histogram operation is performed on block of 192 threads, each provides
    /// two \p int values, result is written to the \p hist array.
    ///
    /// \code{.cpp}
    /// __global__ void example_kernel(...)
    /// {
    ///     // specialize block_histogram for int, logical block of 192 threads,
    ///     // 2 items per thread and a bin size of 192.
    ///     using block_histogram_int = rocprim::block_histogram<int, 192, 2, 192>;
    ///     // allocate storage in shared memory
    ///     __shared__ block_histogram_int::storage_type storage;
    ///     __shared__ int hist[192];
    ///
    ///     int value[2];
    ///     ...
    ///     // initialize histogram
    ///     block_histogram_int().init_histogram(
    ///         hist // output
    ///     );
    ///
    ///     rocprim::syncthreads();
    ///
    ///     // update histogram
    ///     block_histogram_int().composite(
    ///         value, // input
    ///         hist, // output
    ///         storage
    ///     );
    ///     ...
    /// }
    /// \endcode
    /// \endparblock
    template<class Counter>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void composite(T (&input)[ItemsPerThread],
                   Counter hist[Bins],
                   storage_type& storage)
    {
        // Explicitly qualified so that the selected implementation is called.
        base_type::composite(input, hist, storage);
    }

    /// \overload
    /// \brief Update an existing block-wide histogram. Each thread composites an array of
    /// input elements.
    ///
    /// * This overload does not accept storage argument. Required shared memory is
    /// allocated by the method itself.
    ///
    /// \tparam Counter - [inferred] counter type of histogram.
    ///
    /// \param [in] input - reference to an array containing thread input values.
    /// \param [out] hist - histogram bin count.
    template<class Counter>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void composite(T (&input)[ItemsPerThread],
                   Counter hist[Bins])
    {
        // Explicitly qualified so that the selected implementation is called.
        base_type::composite(input, hist);
    }

    /// \brief Construct a new block-wide histogram. Each thread contributes an array of
    /// input elements.
    ///
    /// The histogram is first cleared with init_histogram(), the block is
    /// synchronized, and then the input is accumulated with composite().
    ///
    /// \tparam Counter - [inferred] counter type of histogram.
    ///
    /// \param [in] input - reference to an array containing thread input values.
    /// \param [out] hist - histogram bin count.
    /// \param [in] storage - reference to a temporary storage object of type storage_type.
    ///
    /// \par Storage reusage
    /// Synchronization barrier should be placed before \p storage is reused
    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
    ///
    /// \par Examples
    /// \parblock
    /// In the examples histogram operation is performed on block of 192 threads, each provides
    /// two \p int values, result is written to the \p hist array.
    ///
    /// \code{.cpp}
    /// __global__ void example_kernel(...)
    /// {
    ///     // specialize block_histogram for int, logical block of 192 threads,
    ///     // 2 items per thread and a bin size of 192.
    ///     using block_histogram_int = rocprim::block_histogram<int, 192, 2, 192>;
    ///     // allocate storage in shared memory
    ///     __shared__ block_histogram_int::storage_type storage;
    ///     __shared__ int hist[192];
    ///
    ///     int value[2];
    ///     ...
    ///     // execute histogram
    ///     block_histogram_int().histogram(
    ///         value, // input
    ///         hist, // output
    ///         storage
    ///     );
    ///     ...
    /// }
    /// \endcode
    /// \endparblock
    template<class Counter>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void histogram(T (&input)[ItemsPerThread],
                   Counter hist[Bins],
                   storage_type& storage)
    {
        init_histogram(hist);
        // All bins must be cleared before any thread starts accumulating.
        ::rocprim::syncthreads();
        composite(input, hist, storage);
    }

    /// \overload
    /// \brief Construct a new block-wide histogram. Each thread contributes an array of
    /// input elements.
    ///
    /// * This overload does not accept storage argument. Required shared memory is
    /// allocated by the method itself.
    ///
    /// \tparam Counter - [inferred] counter type of histogram.
    ///
    /// \param [in] input - reference to an array containing thread input values.
    /// \param [out] hist - histogram bin count.
    template<class Counter>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void histogram(T (&input)[ItemsPerThread],
                   Counter hist[Bins])
    {
        init_histogram(hist);
        // All bins must be cleared before any thread starts accumulating.
        ::rocprim::syncthreads();
        composite(input, hist);
    }
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_HISTOGRAM_HPP_
3rdparty/cub/rocprim/block/block_load.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_LOAD_HPP_
#define ROCPRIM_BLOCK_BLOCK_LOAD_HPP_
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "../types.hpp"
#include "block_load_func.hpp"
#include "block_exchange.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief \p block_load_method enumerates the methods available to load data
/// from continuous memory into a blocked arrangement of items across the thread block
enum class block_load_method
{
    /// Data from continuous memory is loaded into a blocked arrangement of items.
    /// \par Performance Notes:
    /// * Performance decreases with increasing number of items per thread (stride
    /// between reads), because of reduced memory coalescing.
    block_load_direct,

    /// A striped arrangement of data is read directly from memory.
    block_load_striped,

    /// Data from continuous memory is loaded into a blocked arrangement of items
    /// using vectorization as an optimization.
    /// \par Performance Notes:
    /// * Performance remains high due to increased memory coalescing, provided that
    /// vectorization requirements are fulfilled. Otherwise, performance will default
    /// to \p block_load_direct.
    /// \par Requirements:
    /// * The input offset (\p block_input) must be quad-item aligned.
    /// * The following conditions will prevent vectorization and switch to default
    /// \p block_load_direct:
    ///   * \p ItemsPerThread is odd.
    ///   * The datatype \p T is not a primitive or a HIP vector type (e.g. int2,
    /// int4, etc.).
    block_load_vectorize,

    /// A striped arrangement of data from continuous memory is locally transposed
    /// into a blocked arrangement of items.
    /// \par Performance Notes:
    /// * Performance remains high due to increased memory coalescing, regardless of the
    /// number of items per thread.
    /// * Performance may be better compared to \p block_load_direct and
    /// \p block_load_vectorize due to reordering on local memory.
    block_load_transpose,

    /// A warp-striped arrangement of data from continuous memory is locally transposed
    /// into a blocked arrangement of items.
    /// \par Requirements:
    /// * The number of threads in the block must be a multiple of the size of hardware warp.
    /// \par Performance Notes:
    /// * Performance remains high due to increased memory coalescing, regardless of the
    /// number of items per thread.
    /// * Performance may be better compared to \p block_load_direct and
    /// \p block_load_vectorize due to reordering on local memory.
    block_load_warp_transpose,

    /// Defaults to \p block_load_direct
    default_method = block_load_direct
};
/// \brief The \p block_load class is a block level parallel primitive which provides methods
/// for loading data from continuous memory into a blocked arrangement of items across the thread
/// block.
///
/// \tparam T - the input/output type.
/// \tparam BlockSize - the number of threads in a block.
/// \tparam ItemsPerThread - the number of items to be processed by
/// each thread.
/// \tparam Method - the method to load data.
///
/// \par Overview
/// * The \p block_load class has a number of different methods to load data:
/// * [block_load_direct](\ref ::block_load_method::block_load_direct)
/// * [block_load_striped](\ref ::block_load_method::block_load_striped)
/// * [block_load_vectorize](\ref ::block_load_method::block_load_vectorize)
/// * [block_load_transpose](\ref ::block_load_method::block_load_transpose)
/// * [block_load_warp_transpose](\ref ::block_load_method::block_load_warp_transpose)
///
/// \par Example:
/// \parblock
/// In the examples load operation is performed on block of 128 threads, using type
/// \p int and 8 items per thread.
///
/// \code{.cpp}
/// __global__ void example_kernel(int * input, ...)
/// {
/// const int offset = blockIdx.x * 128 * 8;
/// int items[8];
/// rocprim::block_load<int, 128, 8, load_method> blockload;
/// blockload.load(input + offset, items);
/// ...
/// }
/// \endcode
/// \endparblock
template
<
class
T
,
unsigned
int
BlockSizeX
,
unsigned
int
ItemsPerThread
,
block_load_method
Method
=
block_load_method
::
block_load_direct
,
unsigned
int
BlockSizeY
=
1
,
unsigned
int
BlockSizeZ
=
1
>
class
block_load
{
private:
using
storage_type_
=
typename
::
rocprim
::
detail
::
empty_storage_type
;
public:
/// \brief Struct used to allocate a temporary memory that is required for thread
/// communication during operations provided by related parallel primitive.
///
/// Depending on the implemention the operations exposed by parallel primitive may
/// require a temporary storage for thread communication. The storage should be allocated
/// using keywords \p __shared__. It can be aliased to
/// an externally allocated memory, or be a part of a union with other storage types
/// to increase shared memory reusability.
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
using
storage_type
=
typename
::
rocprim
::
detail
::
empty_storage_type
;
#else
using
storage_type
=
storage_type_
;
// only for Doxygen
#endif
/// \brief Loads data from continuous memory into an arrangement of items across the
/// thread block.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer.
///
/// \param [in] block_input - the input iterator from the thread block to load from.
/// \param [out] items - array that data is loaded to.
///
/// \par Overview
/// * The type \p T must be such that an object of type \p InputIterator
/// can be dereferenced and then implicitly converted to \p T.
template
<
class
InputIterator
>
ROCPRIM_DEVICE
ROCPRIM_INLINE
void
load
(
InputIterator
block_input
,
T
(
&
items
)[
ItemsPerThread
])
{
using
value_type
=
typename
std
::
iterator_traits
<
InputIterator
>::
value_type
;
static_assert
(
std
::
is_convertible
<
value_type
,
T
>::
value
,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T."
);
const
unsigned
int
flat_id
=
::
rocprim
::
flat_block_thread_id
<
BlockSizeX
,
BlockSizeY
,
BlockSizeZ
>
();
block_load_direct_blocked
(
flat_id
,
block_input
,
items
);
}
/// \brief Loads data from continuous memory into an arrangement of items across the
/// thread block, which is guarded by range \p valid.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer.
///
/// \param [in] block_input - the input iterator from the thread block to load from.
/// \param [out] items - array that data is loaded to.
/// \param [in] valid - maximum range of valid numbers to load.
///
/// \par Overview
/// * The type \p T must be such that an object of type \p InputIterator
/// can be dereferenced and then implicitly converted to \p T.
template
<
class
InputIterator
>
ROCPRIM_DEVICE
ROCPRIM_INLINE
void
load
(
InputIterator
block_input
,
T
(
&
items
)[
ItemsPerThread
],
unsigned
int
valid
)
{
using
value_type
=
typename
std
::
iterator_traits
<
InputIterator
>::
value_type
;
static_assert
(
std
::
is_convertible
<
value_type
,
T
>::
value
,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T."
);
const
unsigned
int
flat_id
=
::
rocprim
::
flat_block_thread_id
<
BlockSizeX
,
BlockSizeY
,
BlockSizeZ
>
();
block_load_direct_blocked
(
flat_id
,
block_input
,
items
,
valid
);
}
/// \brief Loads data from continuous memory into an arrangement of items across the
/// thread block, which is guarded by range with a fall-back value for out-of-bound
/// elements.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer.
/// \tparam Default - [inferred] The data type of the default value.
///
/// \param [in] block_input - the input iterator from the thread block to load from.
/// \param [out] items - array that data is loaded to.
/// \param [in] valid - maximum range of valid numbers to load.
/// \param [in] out_of_bounds - default value assigned to out-of-bound items.
///
/// \par Overview
/// * The type \p T must be such that an object of type \p InputIterator
/// can be dereferenced and then implicitly converted to \p T.
template
<
class
InputIterator
,
class
Default
>
ROCPRIM_DEVICE
ROCPRIM_INLINE
void
load
(
InputIterator
block_input
,
T
(
&
items
)[
ItemsPerThread
],
unsigned
int
valid
,
Default
out_of_bounds
)
{
using
value_type
=
typename
std
::
iterator_traits
<
InputIterator
>::
value_type
;
static_assert
(
std
::
is_convertible
<
value_type
,
T
>::
value
,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T."
);
const
unsigned
int
flat_id
=
::
rocprim
::
flat_block_thread_id
<
BlockSizeX
,
BlockSizeY
,
BlockSizeZ
>
();
block_load_direct_blocked
(
flat_id
,
block_input
,
items
,
valid
,
out_of_bounds
);
}
/// \brief Loads data from continuous memory into an arrangement of items across the
/// thread block, using temporary storage.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer.
///
/// \param [in] block_input - the input iterator from the thread block to load from.
/// \param [out] items - array that data is loaded to.
/// \param [in] storage - temporary storage for inputs.
///
/// \par Overview
/// * The type \p T must be such that an object of type \p InputIterator
/// can be dereferenced and then implicitly converted to \p T.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// int items[8];
/// using block_load_int = rocprim::block_load<int, 128, 8>;
/// block_load_int bload;
/// __shared__ typename block_load_int::storage_type storage;
/// bload.load(..., items, storage);
/// ...
/// }
/// \endcode
template
<
class
InputIterator
>
ROCPRIM_DEVICE
ROCPRIM_INLINE
void
load
(
InputIterator
block_input
,
T
(
&
items
)[
ItemsPerThread
],
storage_type
&
storage
)
{
using
value_type
=
typename
std
::
iterator_traits
<
InputIterator
>::
value_type
;
static_assert
(
std
::
is_convertible
<
value_type
,
T
>::
value
,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T."
);
(
void
)
storage
;
load
(
block_input
,
items
);
}
/// \brief Loads data from continuous memory into an arrangement of items across the
/// thread block, which is guarded by range \p valid, using temporary storage.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer
///
/// \param [in] block_input - the input iterator from the thread block to load from.
/// \param [out] items - array that data is loaded to.
/// \param [in] valid - maximum range of valid numbers to load.
/// \param [in] storage - temporary storage for inputs.
///
/// \par Overview
/// * The type \p T must be such that an object of type \p InputIterator
/// can be dereferenced and then implicitly converted to \p T.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// int items[8];
/// using block_load_int = rocprim::block_load<int, 128, 8>;
/// block_load_int bload;
/// tile_static typename block_load_int::storage_type storage;
/// bload.load(..., items, valid, storage);
/// ...
/// }
/// \endcode
template
<
class
InputIterator
>
ROCPRIM_DEVICE
ROCPRIM_INLINE
void
load
(
InputIterator
block_input
,
T
(
&
items
)[
ItemsPerThread
],
unsigned
int
valid
,
storage_type
&
storage
)
{
using
value_type
=
typename
std
::
iterator_traits
<
InputIterator
>::
value_type
;
static_assert
(
std
::
is_convertible
<
value_type
,
T
>::
value
,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T."
);
(
void
)
storage
;
load
(
block_input
,
items
,
valid
);
}
/// \brief Loads data from continuous memory into an arrangement of items across the
/// thread block, guarded by range \p valid with a fall-back value for out-of-bound
/// elements, using temporary storage.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer).
/// \tparam Default - [inferred] The data type of the default value.
///
/// \param [in] block_input - the input iterator from the thread block to load from.
/// \param [out] items - array that data is loaded to.
/// \param [in] valid - maximum range of valid numbers to load.
/// \param [in] out_of_bounds - default value assigned to out-of-bound items.
/// \param [in] storage - temporary storage for inputs.
///
/// \par Overview
/// * The type \p T must be such that an object of type \p InputIterator
/// can be dereferenced and then implicitly converted to \p T.
/// * This overload does not read or write \p storage; it forwards to the
/// guarded overload with a default value that takes no storage argument.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
///     int items[8];
///     using block_load_int = rocprim::block_load<int, 128, 8>;
///     block_load_int bload;
///     __shared__ typename block_load_int::storage_type storage;
///     bload.load(..., items, valid, out_of_bounds, storage);
///     ...
/// }
/// \endcode
template<class InputIterator, class Default>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
          T (&items)[ItemsPerThread],
          unsigned int valid,
          Default out_of_bounds,
          storage_type& storage)
{
    static_assert(
        std::is_convertible<
            typename std::iterator_traits<InputIterator>::value_type,
            T
        >::value,
        "The type T must be such that an object of type InputIterator "
        "can be dereferenced and then implicitly converted to T."
    );
    // The storage argument is accepted for interface uniformity only.
    static_cast<void>(storage);
    load(block_input, items, valid, out_of_bounds);
}
};
/// @}
// end of group blockmodule
#ifndef DOXYGEN_SHOULD_SKIP_THIS
// Specialization of block_load for block_load_method::block_load_striped.
//
// Every overload passes the calling thread's flat id to
// block_load_direct_striped<BlockSize>. storage_type is
// ::rocprim::detail::empty_storage_type, and the overloads that accept a
// storage reference ignore it.
template<
    class T,
    unsigned int BlockSizeX,
    unsigned int ItemsPerThread,
    unsigned int BlockSizeY,
    unsigned int BlockSizeZ
>
class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_striped, BlockSizeY, BlockSizeZ>
{
    // Product of the three block dimensions.
    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;

private:
    using storage_type_ = typename ::rocprim::detail::empty_storage_type;

public:
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
    using storage_type = typename ::rocprim::detail::empty_storage_type;
#else
    using storage_type = storage_type_; // only for Doxygen
#endif

    // Unguarded striped load of ItemsPerThread items per thread.
    template<class InputIterator>
    ROCPRIM_DEVICE inline
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread])
    {
        using input_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<input_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        block_load_direct_striped<BlockSize>(
            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
            block_input,
            items
        );
    }

    // Striped load guarded by `valid`.
    template<class InputIterator>
    ROCPRIM_DEVICE inline
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid)
    {
        using input_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<input_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        block_load_direct_striped<BlockSize>(
            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
            block_input,
            items,
            valid
        );
    }

    // Striped load guarded by `valid`, with `out_of_bounds` as the fall-back value.
    template<class InputIterator, class Default>
    ROCPRIM_DEVICE inline
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              Default out_of_bounds)
    {
        using input_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<input_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        block_load_direct_striped<BlockSize>(
            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
            block_input,
            items,
            valid,
            out_of_bounds
        );
    }

    // Same as the unguarded load; `storage` is unused.
    template<class InputIterator>
    ROCPRIM_DEVICE inline
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              storage_type& storage)
    {
        using input_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<input_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        static_cast<void>(storage);
        block_load_direct_striped<BlockSize>(
            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
            block_input,
            items
        );
    }

    // Same as the guarded load; `storage` is unused.
    template<class InputIterator>
    ROCPRIM_DEVICE inline
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              storage_type& storage)
    {
        using input_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<input_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        static_cast<void>(storage);
        block_load_direct_striped<BlockSize>(
            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
            block_input,
            items,
            valid
        );
    }

    // Same as the guarded load with a fall-back value; `storage` is unused.
    template<class InputIterator, class Default>
    ROCPRIM_DEVICE inline
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              Default out_of_bounds,
              storage_type& storage)
    {
        using input_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<input_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        static_cast<void>(storage);
        block_load_direct_striped<BlockSize>(
            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
            block_input,
            items,
            valid,
            out_of_bounds
        );
    }
};
// Specialization of block_load for block_load_method::block_load_vectorize.
//
// A raw T* input paired with a T output array is routed through
// block_load_direct_blocked_vectorized; all other overloads use
// block_load_direct_blocked. storage_type is
// ::rocprim::detail::empty_storage_type, and the overloads that accept a
// storage reference ignore it and forward to their storage-less counterpart.
template<
    class T,
    unsigned int BlockSizeX,
    unsigned int ItemsPerThread,
    unsigned int BlockSizeY,
    unsigned int BlockSizeZ
>
class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_vectorize, BlockSizeY, BlockSizeZ>
{
private:
    using storage_type_ = typename ::rocprim::detail::empty_storage_type;

public:
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
    using storage_type = typename ::rocprim::detail::empty_storage_type;
#else
    using storage_type = storage_type_; // only for Doxygen
#endif

    // Pointer overload: the only path that reaches the vectorized loader.
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(T* block_input,
              T (&_items)[ItemsPerThread])
    {
        block_load_direct_blocked_vectorized(
            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
            block_input,
            _items
        );
    }

    // Generic iterator / output-type overload: plain blocked load.
    template<class InputIterator, class U>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              U (&items)[ItemsPerThread])
    {
        using input_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<input_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        block_load_direct_blocked(
            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
            block_input,
            items
        );
    }

    // Blocked load guarded by `valid`.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid)
    {
        using input_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<input_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        block_load_direct_blocked(
            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
            block_input,
            items,
            valid
        );
    }

    // Blocked load guarded by `valid`, with `out_of_bounds` as the fall-back value.
    template<class InputIterator, class Default>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              Default out_of_bounds)
    {
        using input_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<input_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        block_load_direct_blocked(
            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
            block_input,
            items,
            valid,
            out_of_bounds
        );
    }

    // Pointer overload taking storage; `storage` is unused.
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(T* block_input,
              T (&items)[ItemsPerThread],
              storage_type& storage)
    {
        static_cast<void>(storage);
        load(block_input, items);
    }

    // Generic overload taking storage; `storage` is unused.
    template<class InputIterator, class U>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              U (&items)[ItemsPerThread],
              storage_type& storage)
    {
        using input_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<input_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        static_cast<void>(storage);
        load(block_input, items);
    }

    // Guarded overload taking storage; `storage` is unused.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              storage_type& storage)
    {
        using input_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<input_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        static_cast<void>(storage);
        load(block_input, items, valid);
    }

    // Guarded overload with a fall-back value taking storage; `storage` is unused.
    template<class InputIterator, class Default>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              Default out_of_bounds,
              storage_type& storage)
    {
        using input_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<input_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        static_cast<void>(storage);
        load(block_input, items, valid, out_of_bounds);
    }
};
// Specialization of block_load for block_load_method::block_load_transpose.
//
// Each overload first performs a striped load with
// block_load_direct_striped<BlockSize> and then rearranges the items in place
// with block_exchange_type().striped_to_blocked(). The overloads without a
// storage parameter use a function-local ROCPRIM_SHARED_MEMORY storage object;
// the others use the storage supplied by the caller.
template<
    class T,
    unsigned int BlockSizeX,
    unsigned int ItemsPerThread,
    unsigned int BlockSizeY,
    unsigned int BlockSizeZ
>
class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_transpose, BlockSizeY, BlockSizeZ>
{
    // Product of the three block dimensions.
    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;

private:
    // Instantiate the exchange with the individual block dimensions rather than
    // the flattened BlockSize, so it describes the same block shape as this
    // loader and matches the block_load_warp_transpose specialization. For
    // one-dimensional blocks (BlockSizeY == BlockSizeZ == 1) this names the
    // same instantiation as before.
    // NOTE(review): presumably block_exchange derives its flat thread id from
    // these template parameters, in which case <T, BlockSize, ItemsPerThread>
    // would index incorrectly for multi-dimensional blocks - confirm in
    // block_exchange.hpp.
    using block_exchange_type = block_exchange<T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ>;

public:
    using storage_type = typename block_exchange_type::storage_type;

    // Unguarded load using function-local exchange storage.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread])
    {
        using value_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<value_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        ROCPRIM_SHARED_MEMORY storage_type storage;
        const unsigned int flat_id
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_striped<BlockSize>(flat_id, block_input, items);
        block_exchange_type().striped_to_blocked(items, items, storage);
    }

    // Load guarded by `valid`, using function-local exchange storage.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid)
    {
        using value_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<value_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        ROCPRIM_SHARED_MEMORY storage_type storage;
        const unsigned int flat_id
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
        block_exchange_type().striped_to_blocked(items, items, storage);
    }

    // Load guarded by `valid` with `out_of_bounds` as the fall-back value,
    // using function-local exchange storage.
    template<class InputIterator, class Default>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              Default out_of_bounds)
    {
        using value_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<value_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        ROCPRIM_SHARED_MEMORY storage_type storage;
        const unsigned int flat_id
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid, out_of_bounds);
        block_exchange_type().striped_to_blocked(items, items, storage);
    }

    // Unguarded load using caller-provided exchange storage.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              storage_type& storage)
    {
        using value_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<value_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        const unsigned int flat_id
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_striped<BlockSize>(flat_id, block_input, items);
        block_exchange_type().striped_to_blocked(items, items, storage);
    }

    // Load guarded by `valid`, using caller-provided exchange storage.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              storage_type& storage)
    {
        using value_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<value_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        const unsigned int flat_id
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
        block_exchange_type().striped_to_blocked(items, items, storage);
    }

    // Load guarded by `valid` with `out_of_bounds` as the fall-back value,
    // using caller-provided exchange storage.
    template<class InputIterator, class Default>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              Default out_of_bounds,
              storage_type& storage)
    {
        using value_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<value_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        const unsigned int flat_id
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid, out_of_bounds);
        block_exchange_type().striped_to_blocked(items, items, storage);
    }
};
// Specialization of block_load for block_load_method::block_load_warp_transpose.
//
// Each overload first performs a warp-striped load with
// block_load_direct_warp_striped and then rearranges the items in place with
// block_exchange_type().warp_striped_to_blocked(). The overloads without a
// storage parameter use a function-local ROCPRIM_SHARED_MEMORY storage object;
// the others use the storage supplied by the caller.
template<
    class T,
    unsigned int BlockSizeX,
    unsigned int ItemsPerThread,
    unsigned int BlockSizeY,
    unsigned int BlockSizeZ
>
class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_warp_transpose, BlockSizeY, BlockSizeZ>
{
    // Product of the three block dimensions.
    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;

private:
    using block_exchange_type = block_exchange<T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ>;

public:
    static_assert(BlockSize % ::rocprim::device_warp_size() == 0,
                  "BlockSize must be a multiple of hardware warpsize");

    using storage_type = typename block_exchange_type::storage_type;

    // Unguarded load using function-local exchange storage.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread])
    {
        using input_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<input_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        ROCPRIM_SHARED_MEMORY storage_type storage;
        const unsigned int tid
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_warp_striped(tid, block_input, items);
        block_exchange_type().warp_striped_to_blocked(items, items, storage);
    }

    // Load guarded by `valid`, using function-local exchange storage.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid)
    {
        using input_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<input_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        ROCPRIM_SHARED_MEMORY storage_type storage;
        const unsigned int tid
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_warp_striped(tid, block_input, items, valid);
        block_exchange_type().warp_striped_to_blocked(items, items, storage);
    }

    // Load guarded by `valid` with `out_of_bounds` as the fall-back value,
    // using function-local exchange storage.
    template<class InputIterator, class Default>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              Default out_of_bounds)
    {
        using input_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<input_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        ROCPRIM_SHARED_MEMORY storage_type storage;
        const unsigned int tid
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_warp_striped(tid, block_input, items, valid, out_of_bounds);
        block_exchange_type().warp_striped_to_blocked(items, items, storage);
    }

    // Unguarded load using caller-provided exchange storage.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              storage_type& storage)
    {
        using input_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<input_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        const unsigned int tid
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_warp_striped(tid, block_input, items);
        block_exchange_type().warp_striped_to_blocked(items, items, storage);
    }

    // Load guarded by `valid`, using caller-provided exchange storage.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              storage_type& storage)
    {
        using input_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<input_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        const unsigned int tid
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_warp_striped(tid, block_input, items, valid);
        block_exchange_type().warp_striped_to_blocked(items, items, storage);
    }

    // Load guarded by `valid` with `out_of_bounds` as the fall-back value,
    // using caller-provided exchange storage.
    template<class InputIterator, class Default>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              Default out_of_bounds,
              storage_type& storage)
    {
        using input_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<input_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        const unsigned int tid
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_warp_striped(tid, block_input, items, valid, out_of_bounds);
        block_exchange_type().warp_striped_to_blocked(items, items, storage);
    }
};
#endif // DOXYGEN_SHOULD_SKIP_THIS
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_BLOCK_BLOCK_LOAD_HPP_
3rdparty/cub/rocprim/block/block_load_func.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_LOAD_FUNC_HPP_
#define ROCPRIM_BLOCK_BLOCK_LOAD_FUNC_HPP_
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "../types.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup blockmodule
/// @{
/// \brief Loads data from continuous memory into a blocked arrangement of items
/// across the thread block.
///
/// The block arrangement is assumed to be (block-threads * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread into \p items: the thread reads the consecutive elements
/// starting at \p block_input + \p flat_id * \p ItemsPerThread.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
template<
    class InputIterator,
    class T,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_blocked(unsigned int flat_id,
                               InputIterator block_input,
                               T (&items)[ItemsPerThread])
{
    // Index of the first element owned by this thread.
    unsigned int first = flat_id * ItemsPerThread;
    InputIterator src = block_input + first;
    ROCPRIM_UNROLL
    for(unsigned int i = 0; i < ItemsPerThread; i++)
    {
        items[i] = src[i];
    }
}
/// \brief Loads data from continuous memory into a blocked arrangement of items
/// across the thread block, which is guarded by range \p valid.
///
/// The block arrangement is assumed to be (block-threads * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread into \p items. Only elements whose index within the block
/// input is below \p valid are read; the remaining entries of \p items are left
/// unmodified.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
/// \param valid - maximum range of valid numbers to load
template<
    class InputIterator,
    class T,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_blocked(unsigned int flat_id,
                               InputIterator block_input,
                               T (&items)[ItemsPerThread],
                               unsigned int valid)
{
    // Index of the first element owned by this thread.
    unsigned int first = flat_id * ItemsPerThread;
    InputIterator src = block_input + first;
    ROCPRIM_UNROLL
    for(unsigned int i = 0; i < ItemsPerThread; i++)
    {
        // Skip positions at or beyond the valid range.
        if(i + first < valid)
        {
            items[i] = src[i];
        }
    }
}
/// \brief Loads data from continuous memory into a blocked arrangement of items
/// across the thread block, which is guarded by range with a fall-back value
/// for out-of-bound elements.
///
/// The block arrangement is assumed to be (block-threads * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread into \p items. All entries of \p items are first set to
/// \p out_of_bounds (converted to \p T); the guarded load then overwrites the
/// in-range ones.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
/// \tparam Default - [inferred] The data type of the default value
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
/// \param valid - maximum range of valid numbers to load
/// \param out_of_bounds - default value assigned to out-of-bound items
template<
    class InputIterator,
    class T,
    unsigned int ItemsPerThread,
    class Default
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_blocked(unsigned int flat_id,
                               InputIterator block_input,
                               T (&items)[ItemsPerThread],
                               unsigned int valid,
                               Default out_of_bounds)
{
    // Pre-fill every slot with the fall-back value.
    ROCPRIM_UNROLL
    for(unsigned int i = 0; i < ItemsPerThread; i++)
    {
        items[i] = static_cast<T>(out_of_bounds);
    }
    // TODO: Consider using std::fill for HIP-CPU, as uses memset() where appropriate

    // The guarded load replaces only the in-range slots.
    block_load_direct_blocked(flat_id, block_input, items, valid);
}
/// \brief Loads data from continuous memory into a blocked arrangement of items
/// across the thread block.
///
/// The block arrangement is assumed to be (block-threads * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread into \p items.
///
/// The input offset (\p block_input + offset) must be quad-item aligned.
///
/// The following conditions will prevent vectorization and switch to default
/// block_load_direct_blocked:
/// * \p ItemsPerThread is odd.
/// * The datatype \p T is not a primitive or a HIP vector type (e.g. int2,
/// int4, etc.)
///
/// \tparam T - [inferred] the input data type
/// \tparam U - [inferred] the output data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// The type \p T must be such that it can be implicitly converted to \p U.
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
template<
    class T,
    class U,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto block_load_direct_blocked_vectorized(unsigned int flat_id,
                                          T* block_input,
                                          U (&items)[ItemsPerThread])
    -> typename std::enable_if<detail::is_vectorizable<T, ItemsPerThread>::value>::type
{
    using vector_type = typename detail::match_vector_type<T, ItemsPerThread>::type;

    // Number of vector_type elements covering this thread's ItemsPerThread values.
    constexpr unsigned int vectors_per_thread
        = (sizeof(T) * ItemsPerThread) / sizeof(vector_type);

    vector_type staging[vectors_per_thread];

    // View the input as vector_type and step to this thread's slice.
    const vector_type* src
        = reinterpret_cast<const vector_type*>(block_input) + (flat_id * vectors_per_thread);

    ROCPRIM_UNROLL
    for(unsigned int v = 0; v < vectors_per_thread; v++)
    {
        staging[v] = src[v];
    }

    // Read the staged vectors back element by element as T and store into items.
    ROCPRIM_UNROLL
    for(unsigned int i = 0; i < ItemsPerThread; i++)
    {
        items[i] = reinterpret_cast<T*>(staging)[i];
    }
}
// Fallback overload, selected when detail::is_vectorizable<T, ItemsPerThread>
// is false: performs the plain blocked load instead.
template<
    class T,
    class U,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto block_load_direct_blocked_vectorized(unsigned int flat_id,
                                          T* block_input,
                                          U (&items)[ItemsPerThread])
    -> typename std::enable_if<!detail::is_vectorizable<T, ItemsPerThread>::value>::type
{
    block_load_direct_blocked(flat_id, block_input, items);
}
/// \brief Loads data from continuous memory into a striped arrangement of items
/// across the thread block.
///
/// The striped arrangement is assumed to be (\p BlockSize * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread into \p items: item \c i of a thread is read from
/// \p block_input + \p flat_id + \c i * \p BlockSize.
///
/// \tparam BlockSize - the number of threads in a block
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
template<
    unsigned int BlockSize,
    class InputIterator,
    class T,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_striped(unsigned int flat_id,
                               InputIterator block_input,
                               T (&items)[ItemsPerThread])
{
    // Each thread starts at its own flat id and strides by BlockSize.
    InputIterator src = block_input + flat_id;
    ROCPRIM_UNROLL
    for(unsigned int i = 0; i < ItemsPerThread; i++)
    {
        items[i] = src[i * BlockSize];
    }
}
/// \brief Loads data from continuous memory into a striped arrangement of items
/// across the thread block, which is guarded by range \p valid.
///
/// The striped arrangement is assumed to be (\p BlockSize * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread into \p items. Only elements whose index within the block
/// input is below \p valid are read; the remaining entries of \p items are left
/// unmodified.
///
/// \tparam BlockSize - the number of threads in a block
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
/// \param valid - maximum range of valid numbers to load
template<
    unsigned int BlockSize,
    class InputIterator,
    class T,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_striped(unsigned int flat_id,
                               InputIterator block_input,
                               T (&items)[ItemsPerThread],
                               unsigned int valid)
{
    // Each thread starts at its own flat id and strides by BlockSize.
    InputIterator src = block_input + flat_id;
    ROCPRIM_UNROLL
    for(unsigned int i = 0; i < ItemsPerThread; i++)
    {
        unsigned int stride_offset = i * BlockSize;
        // Skip positions at or beyond the valid range.
        if(flat_id + stride_offset < valid)
        {
            items[i] = src[stride_offset];
        }
    }
}
/// \brief Guarded striped load with a fall-back value: copies items from contiguous
/// memory into a striped arrangement across the thread block and assigns
/// \p out_of_bounds to every entry whose input position is at or beyond \p valid.
///
/// Every entry of \p items is first set to \p out_of_bounds; the range-guarded
/// overload of block_load_direct_striped is then called, which overwrites the
/// entries whose position <tt>flat_id + item * BlockSize</tt> is smaller than
/// \p valid.
///
/// \tparam BlockSize - the number of threads in a block
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
/// \tparam Default - [inferred] The data type of the default value
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
/// \param valid - exclusive upper bound on the input positions that may be read
/// \param out_of_bounds - default value assigned to out-of-bound items
template<unsigned int BlockSize,
         class InputIterator,
         class T,
         unsigned int ItemsPerThread,
         class Default>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_striped(unsigned int flat_id,
                               InputIterator block_input,
                               T (&items)[ItemsPerThread],
                               unsigned int valid,
                               Default out_of_bounds)
{
    // Step 1: give every slot the fall-back value.
    ROCPRIM_UNROLL
    for(unsigned int i = 0; i < ItemsPerThread; i++)
    {
        items[i] = out_of_bounds;
    }

    // Step 2: the guarded overload replaces the slots that are in range.
    block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
}
/// \brief Loads data from continuous memory into a warp-striped arrangement of items
/// across the thread block.
///
/// Each logical warp of \p WarpSize threads covers a contiguous run of
/// (\p WarpSize * \p ItemsPerThread) input items that starts at
/// <tt>(flat_id / WarpSize) * WarpSize * ItemsPerThread</tt>. Within that run a
/// thread reads the elements at <tt>lane + item * WarpSize</tt>, where
/// <tt>lane</tt> is the value returned by <tt>detail::logical_lane_id<WarpSize>()</tt>.
///
/// * The number of threads in the block must be a multiple of \p WarpSize.
/// * The default \p WarpSize is a hardware warpsize and is an optimal value.
/// * \p WarpSize must be a power of two and equal or less than the size of
/// hardware warp (enforced by a \p static_assert).
/// * Using \p WarpSize smaller than hardware warpsize could result in lower
/// performance.
///
/// \tparam WarpSize - [optional] the number of threads in a warp
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
template<unsigned int WarpSize = device_warp_size(),
         class InputIterator,
         class T,
         unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_warp_striped(unsigned int flat_id,
                                    InputIterator block_input,
                                    T (&items)[ItemsPerThread])
{
    // Adjacent string literals are concatenated verbatim, so the first one
    // must end with a space to keep "less than" as two words.
    static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
                  "WarpSize must be a power of two and equal or less "
                  "than the size of hardware warp.");

    unsigned int thread_id   = detail::logical_lane_id<WarpSize>();
    unsigned int warp_id     = flat_id / WarpSize;
    // First input position owned by this thread's logical warp.
    unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread;

    InputIterator thread_iter = block_input + thread_id + warp_offset;
    ROCPRIM_UNROLL
    for(unsigned int item = 0; item < ItemsPerThread; item++)
    {
        items[item] = thread_iter[item * WarpSize];
    }
}
/// \brief Loads data from continuous memory into a warp-striped arrangement of items
/// across the thread block, which is guarded by range \p valid.
///
/// Each logical warp of \p WarpSize threads covers a contiguous run of
/// (\p WarpSize * \p ItemsPerThread) input items that starts at
/// <tt>(flat_id / WarpSize) * WarpSize * ItemsPerThread</tt>. Within that run a
/// thread addresses the elements at <tt>lane + item * WarpSize</tt>, where
/// <tt>lane</tt> is the value returned by <tt>detail::logical_lane_id<WarpSize>()</tt>.
/// An element is read only when its position relative to \p block_input is smaller
/// than \p valid; otherwise the matching entry of \p items is left unmodified.
///
/// * The number of threads in the block must be a multiple of \p WarpSize.
/// * The default \p WarpSize is a hardware warpsize and is an optimal value.
/// * \p WarpSize must be a power of two and equal or less than the size of
/// hardware warp (enforced by a \p static_assert).
/// * Using \p WarpSize smaller than hardware warpsize could result in lower
/// performance.
///
/// \tparam WarpSize - [optional] the number of threads in a warp
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
/// \param valid - exclusive upper bound on the input positions that may be read
template<unsigned int WarpSize = device_warp_size(),
         class InputIterator,
         class T,
         unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_warp_striped(unsigned int flat_id,
                                    InputIterator block_input,
                                    T (&items)[ItemsPerThread],
                                    unsigned int valid)
{
    // Adjacent string literals are concatenated verbatim, so the first one
    // must end with a space to keep "less than" as two words.
    static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
                  "WarpSize must be a power of two and equal or less "
                  "than the size of hardware warp.");

    unsigned int thread_id   = detail::logical_lane_id<WarpSize>();
    unsigned int warp_id     = flat_id / WarpSize;
    // First input position owned by this thread's logical warp.
    unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread;

    InputIterator thread_iter = block_input + thread_id + warp_offset;
    ROCPRIM_UNROLL
    for(unsigned int item = 0; item < ItemsPerThread; item++)
    {
        unsigned int offset = item * WarpSize;
        // Compare the element's position relative to block_input against valid.
        if(warp_offset + thread_id + offset < valid)
        {
            items[item] = thread_iter[offset];
        }
    }
}
/// \brief Loads data from continuous memory into a warp-striped arrangement of items
/// across the thread block, which is guarded by range with a fall-back value
/// for out-of-bound elements.
///
/// Every entry of \p items is first set to \p out_of_bounds; the range-guarded
/// overload of block_load_direct_warp_striped is then called with the same
/// \p WarpSize, which overwrites the entries whose input position is smaller than
/// \p valid.
///
/// * The number of threads in the block must be a multiple of \p WarpSize.
/// * The default \p WarpSize is a hardware warpsize and is an optimal value.
/// * \p WarpSize must be a power of two and equal or less than the size of
/// hardware warp (enforced by a \p static_assert).
/// * Using \p WarpSize smaller than hardware warpsize could result in lower
/// performance.
///
/// \tparam WarpSize - [optional] the number of threads in a warp
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
/// \tparam Default - [inferred] The data type of the default value
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
/// \param valid - exclusive upper bound on the input positions that may be read
/// \param out_of_bounds - default value assigned to out-of-bound items
template<unsigned int WarpSize = device_warp_size(),
         class InputIterator,
         class T,
         unsigned int ItemsPerThread,
         class Default>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_warp_striped(unsigned int flat_id,
                                    InputIterator block_input,
                                    T (&items)[ItemsPerThread],
                                    unsigned int valid,
                                    Default out_of_bounds)
{
    // Adjacent string literals are concatenated verbatim, so the first one
    // must end with a space to keep "less than" as two words.
    static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
                  "WarpSize must be a power of two and equal or less "
                  "than the size of hardware warp.");

    // Give every slot the fall-back value first ...
    ROCPRIM_UNROLL
    for(unsigned int item = 0; item < ItemsPerThread; item++)
    {
        items[item] = out_of_bounds;
    }

    // ... then let the guarded overload replace the slots that are in range.
    block_load_direct_warp_striped<WarpSize>(flat_id, block_input, items, valid);
}
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_LOAD_FUNC_HPP_
3rdparty/cub/rocprim/block/block_radix_sort.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_RADIX_SORT_HPP_
#define ROCPRIM_BLOCK_BLOCK_RADIX_SORT_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../detail/radix_sort.hpp"
#include "../warp/detail/warp_scan_crosslane.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "../types.hpp"
#include "block_exchange.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
namespace
detail
{
/// Block-level exclusive plus-scan specialised for bool (1 bit) values.
/// Per-warp totals and per-lane prefixes are obtained from ballot and bit-count
/// operations instead of the generic scan and reduce classes; this is much faster
/// (several times) because the hardware can report directly which lanes hold a
/// true predicate.
template<unsigned int BlockSizeX, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
class block_bit_plus_scan
{
    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;

    // Warp size used for partitioning the block.
    static constexpr unsigned int warp_size
        = detail::get_min_warp_size(BlockSize, ::rocprim::device_warp_size());

    // Number of warps needed to cover the block (rounded up).
    static constexpr unsigned int warps_no = (BlockSize + warp_size - 1) / warp_size;

    // Warp scan used to turn the per-warp totals into per-warp prefixes.
    // warp_scan_crosslane needs no storage, but its logical warp size has to be
    // a power of two, hence next_power_of_two(warps_no).
    using warp_scan_prefix_type
        = ::rocprim::detail::warp_scan_crosslane<unsigned int,
                                                 detail::next_power_of_two(warps_no)>;

public:
    struct storage_type_
    {
        // One slot per warp: first the warp's total, later its inclusive prefix.
        unsigned int warp_prefixes[warps_no];
        // No extra temporary memory is required for the warp scan itself,
        // because warp_scan_crosslane works without storage.
    };

    using storage_type = detail::raw_storage<storage_type_>;

    /// Computes the exclusive prefix sum of the bit values held by all threads.
    ///
    /// \tparam ItemsPerThread - [inferred] number of bit values per thread
    ///
    /// \param input - bit values of the calling thread (each is passed to ballot)
    /// \param output - receives the exclusive prefix sums for the thread's items
    /// \param reduction - receives the inclusive prefix stored for the last warp,
    /// i.e. the sum over all warps
    /// \param storage - temporary storage shared by the threads of the block
    template<unsigned int ItemsPerThread>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void exclusive_scan(const unsigned int (&input)[ItemsPerThread],
                        unsigned int (&output)[ItemsPerThread],
                        unsigned int& reduction,
                        storage_type& storage)
    {
        const unsigned int tid
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        const unsigned int lane = ::rocprim::lane_id();
        const unsigned int warp = ::rocprim::warp_id(tid);
        storage_type_& shared = storage.get();

        // Total number of set bits contributed by this warp over all items.
        unsigned int warp_total = 0;
        for(unsigned int k = 0; k < ItemsPerThread; k++)
        {
            warp_total += ::rocprim::bit_count(::rocprim::ballot(input[k]));
        }

        // A single lane per warp publishes the warp total.
        if(lane == 0)
        {
            shared.warp_prefixes[warp] = warp_total;
        }
        ::rocprim::syncthreads();

        // The first warps_no threads scan the warp totals into inclusive prefixes.
        if(tid < warps_no)
        {
            unsigned int scanned = shared.warp_prefixes[tid];
            warp_scan_prefix_type().inclusive_scan(scanned,
                                                   scanned,
                                                   ::rocprim::plus<unsigned int>());
            shared.warp_prefixes[tid] = scanned;
        }
#ifdef __HIP_CPU_RT__
        else
        {
            // HIP-CPU doesn't implement lockstep behavior. Need to invoke the same number sync ops in divergent branch.
            empty_type empty;
            ::rocprim::detail::warp_scan_crosslane<empty_type,
                                                   detail::next_power_of_two(warps_no)>()
                .inclusive_scan(empty, empty, empty_binary_op{});
        }
#endif
        ::rocprim::syncthreads();

        // Exclusive warp-level scan of the bit values: each item's ballot is fed to
        // masked_bit_count together with the running value from the previous items.
        unsigned int lane_prefix = 0;
        for(unsigned int k = 0; k < ItemsPerThread; k++)
        {
            lane_prefix = ::rocprim::masked_bit_count(::rocprim::ballot(input[k]), lane_prefix);
        }

        // First item: lane prefix plus the inclusive prefix of the preceding warp
        // (warp 0 has no predecessor).
        if(warp == 0)
        {
            output[0] = lane_prefix;
        }
        else
        {
            output[0] = lane_prefix + shared.warp_prefixes[warp - 1];
        }

        // Remaining items: running sum over the thread's own bit values.
        for(unsigned int k = 1; k < ItemsPerThread; k++)
        {
            output[k] = output[k - 1] + input[k - 1];
        }

        // The inclusive prefix of the last warp is the total for the block.
        reduction = shared.warp_prefixes[warps_no - 1];
    }
};
}
// end namespace detail
/// \brief The block_radix_sort class is a block level parallel primitive which provides
/// methods sorting items (keys or key-value pairs) partitioned across threads in a block
/// using radix sort algorithm.
///
/// \tparam Key - the key type.
/// \tparam BlockSize - the number of threads in a block.
/// \tparam ItemsPerThread - the number of items contributed by each thread.
/// \tparam Value - the value type. Default type empty_type indicates
/// a keys-only sort.
///
/// \par Overview
/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
/// type).
/// * Performance depends on \p BlockSize and \p ItemsPerThread.
/// * It is usually better if \p BlockSize is a multiple of the size of the hardware warp.
/// * It is usually increased when \p ItemsPerThread is greater than one. However, when there
/// are too many items per thread, each thread may need so much registers and/or shared memory
/// that occupancy will fall too low, decreasing the performance.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \par Examples
/// \parblock
/// In the examples radix sort is performed on a block of 256 threads, each thread provides
/// eight \p int values, results are returned using the same array as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_radix_sort for int, block of 256 threads,
/// // and eight items per thread; key-only sort
/// using block_rsort_int = rocprim::block_radix_sort<int, 256, 8>;
/// // allocate storage in shared memory
/// __shared__ block_rsort_int::storage_type storage;
///
/// int input[8] = ...;
/// // execute block radix sort (ascending)
/// block_rsort_int().sort(
/// input,
/// storage
/// );
/// ...
/// }
/// \endcode
/// \endparblock
template
<
class
Key
,
unsigned
int
BlockSizeX
,
unsigned
int
ItemsPerThread
,
class
Value
=
empty_type
,
unsigned
int
BlockSizeY
=
1
,
unsigned
int
BlockSizeZ
=
1
>
class
block_radix_sort
{
static
constexpr
unsigned
int
BlockSize
=
BlockSizeX
*
BlockSizeY
*
BlockSizeZ
;
static
constexpr
bool
with_values
=
!
std
::
is_same
<
Value
,
empty_type
>::
value
;
using
bit_key_type
=
typename
::
rocprim
::
detail
::
radix_key_codec
<
Key
>::
bit_key_type
;
using
bit_block_scan
=
detail
::
block_bit_plus_scan
<
BlockSizeX
,
BlockSizeY
,
BlockSizeZ
>
;
using
bit_keys_exchange_type
=
::
rocprim
::
block_exchange
<
bit_key_type
,
BlockSizeX
,
ItemsPerThread
,
BlockSizeY
,
BlockSizeZ
>
;
using
values_exchange_type
=
::
rocprim
::
block_exchange
<
Value
,
BlockSizeX
,
ItemsPerThread
,
BlockSizeY
,
BlockSizeZ
>
;
// Struct used for creating a raw_storage object for this primitive's temporary storage.
struct
storage_type_
{
union
{
typename
bit_keys_exchange_type
::
storage_type
bit_keys_exchange
;
typename
values_exchange_type
::
storage_type
values_exchange
;
};
typename
block_radix_sort
<
Key
,
BlockSizeX
,
ItemsPerThread
,
Value
,
BlockSizeY
,
BlockSizeZ
>::
bit_block_scan
::
storage_type
bit_block_scan
;
};
public:
/// \brief Struct used to allocate a temporary memory that is required for thread
/// communication during operations provided by related parallel primitive.
///
/// Depending on the implementation the operations exposed by parallel primitive may
/// require a temporary storage for thread communication. The storage should be allocated
/// using keywords <tt>__shared__</tt>. It can be aliased to
/// an externally allocated memory, or be a part of a union type with other storage types
/// to increase shared memory reusability.
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
using
storage_type
=
detail
::
raw_storage
<
storage_type_
>
;
#else
using
storage_type
=
storage_type_
;
// only for Doxygen
#endif
/// \brief Performs ascending radix sort over keys partitioned across threads in a block.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// In the examples radix sort is performed on a block of 128 threads, each thread provides
/// two \p float value, results are returned using the same array as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_radix_sort for float, block of 128 threads,
/// // and two items per thread; key-only sort
/// using block_rsort_float = rocprim::block_radix_sort<float, 128, 2>;
/// // allocate storage in shared memory
/// __shared__ block_rsort_float::storage_type storage;
///
/// float input[2] = ...;
/// // execute block radix sort (ascending)
/// block_rsort_float().sort(
/// input,
/// storage
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{[256, 255], ..., [4, 3], [2, 1]}</tt>,
/// then after sort they will be equal <tt>{[1, 2], [3, 4] ..., [255, 256]}</tt>.
/// \endparblock
ROCPRIM_DEVICE ROCPRIM_INLINE
void sort(Key (&keys)[ItemsPerThread],
          storage_type& storage,
          unsigned int begin_bit = 0,
          unsigned int end_bit = 8 * sizeof(Key))
{
    // Keys-only sort: sort_impl always takes a values array, so hand it an
    // array of empty_type placeholders.
    empty_type placeholder_values[ItemsPerThread];

    // Ascending variant calls sort_impl<false>; sort_desc uses sort_impl<true>.
    sort_impl<false>(keys, placeholder_values, storage, begin_bit, end_bit);
}
/// \overload
/// \brief Performs ascending radix sort over keys partitioned across threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void sort(Key (&keys)[ItemsPerThread],
          unsigned int begin_bit = 0,
          unsigned int end_bit = 8 * sizeof(Key))
{
    // Temporary storage is declared here (ROCPRIM_SHARED_MEMORY) so the caller
    // does not have to supply it.
    ROCPRIM_SHARED_MEMORY storage_type local_storage;

    // Forward to the overload that takes explicit storage.
    this->sort(keys, local_storage, begin_bit, end_bit);
}
/// \brief Performs descending radix sort over keys partitioned across threads in a block.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// In the examples radix sort is performed on a block of 128 threads, each thread provides
/// two \p float value, results are returned using the same array as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_radix_sort for float, block of 128 threads,
/// // and two items per thread; key-only sort
/// using block_rsort_float = rocprim::block_radix_sort<float, 128, 2>;
/// // allocate storage in shared memory
/// __shared__ block_rsort_float::storage_type storage;
///
/// float input[2] = ...;
/// // execute block radix sort (descending)
/// block_rsort_float().sort_desc(
/// input,
/// storage
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{[1, 2], [3, 4] ..., [255, 256]}</tt>,
/// then after sort they will be equal <tt>{[256, 255], ..., [4, 3], [2, 1]}</tt>.
/// \endparblock
ROCPRIM_DEVICE ROCPRIM_INLINE
void sort_desc(Key (&keys)[ItemsPerThread],
               storage_type& storage,
               unsigned int begin_bit = 0,
               unsigned int end_bit = 8 * sizeof(Key))
{
    // Keys-only sort: sort_impl always takes a values array, so hand it an
    // array of empty_type placeholders.
    empty_type placeholder_values[ItemsPerThread];

    // Descending variant calls sort_impl<true>; sort uses sort_impl<false>.
    sort_impl<true>(keys, placeholder_values, storage, begin_bit, end_bit);
}
/// \overload
/// \brief Performs descending radix sort over keys partitioned across threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void sort_desc(Key (&keys)[ItemsPerThread],
               unsigned int begin_bit = 0,
               unsigned int end_bit = 8 * sizeof(Key))
{
    // Temporary storage is declared here (ROCPRIM_SHARED_MEMORY) so the caller
    // does not have to supply it.
    ROCPRIM_SHARED_MEMORY storage_type local_storage;

    // Forward to the overload that takes explicit storage.
    this->sort_desc(keys, local_storage, begin_bit, end_bit);
}
/// \brief Performs ascending radix sort over key-value pairs partitioned across
/// threads in a block.
///
/// \pre Method is enabled only if \p Value type is different than empty_type.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in, out] values - reference to an array of values provided by a thread.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// In the examples radix sort is performed on a block of 128 threads, each thread provides
/// two key-value <tt>int</tt>-<tt>float</tt> pairs, results are returned using the same
/// arrays as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_radix_sort for int-float pairs, block of 128
/// // threads, and two items per thread
/// using block_rsort_ii = rocprim::block_radix_sort<int, 128, 2, float>;
/// // allocate storage in shared memory
/// __shared__ block_rsort_ii::storage_type storage;
///
/// int keys[2] = ...;
/// float values[2] = ...;
/// // execute block radix sort-by-key (ascending)
/// block_rsort_ii().sort(
/// keys, values,
/// storage
/// );
/// ...
/// }
/// \endcode
///
/// If the \p keys across threads in a block are <tt>{[256, 255], ..., [4, 3], [2, 1]}</tt> and
/// the \p values are <tt>{[1, 1], [2, 2] ..., [128, 128]}</tt>, then after sort the \p keys
/// will be equal <tt>{[1, 2], [3, 4] ..., [255, 256]}</tt> and the \p values will be
/// equal <tt>{[128, 128], [127, 127] ..., [2, 2], [1, 1]}</tt>.
/// \endparblock
template<bool WithValues = with_values>
ROCPRIM_DEVICE ROCPRIM_INLINE
void sort(Key (&keys)[ItemsPerThread],
          typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
          storage_type& storage,
          unsigned int begin_bit = 0,
          unsigned int end_bit = 8 * sizeof(Key))
{
    // The enable_if in the signature removes this overload when WithValues is
    // false (its default, with_values, is false for Value == empty_type).
    // Ascending variant calls sort_impl<false>; sort_desc uses sort_impl<true>.
    sort_impl<false>(keys, values, storage, begin_bit, end_bit);
}
/// \overload
/// \brief Performs ascending radix sort over key-value pairs partitioned across
/// threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \pre Method is enabled only if \p Value type is different than empty_type.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in, out] values - reference to an array of values provided by a thread.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
template<bool WithValues = with_values>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void sort(Key (&keys)[ItemsPerThread],
          typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
          unsigned int begin_bit = 0,
          unsigned int end_bit = 8 * sizeof(Key))
{
    // Temporary storage is declared here (ROCPRIM_SHARED_MEMORY) so the caller
    // does not have to supply it.
    ROCPRIM_SHARED_MEMORY storage_type local_storage;

    // Forward to the key-value overload that takes explicit storage.
    this->sort(keys, values, local_storage, begin_bit, end_bit);
}
/// \brief Performs descending radix sort over key-value pairs partitioned across
/// threads in a block.
///
/// \pre Method is enabled only if \p Value type is different than empty_type.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in, out] values - reference to an array of values provided by a thread.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// In the examples radix sort is performed on a block of 128 threads, each thread provides
/// two key-value <tt>int</tt>-<tt>float</tt> pairs, results are returned using the same
/// arrays as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_radix_sort for int-float pairs, block of 128
/// // threads, and two items per thread
/// using block_rsort_ii = rocprim::block_radix_sort<int, 128, 2, float>;
/// // allocate storage in shared memory
/// __shared__ block_rsort_ii::storage_type storage;
///
/// int keys[2] = ...;
/// float values[2] = ...;
/// // execute block radix sort-by-key (descending)
/// block_rsort_ii().sort_desc(
/// keys, values,
/// storage
/// );
/// ...
/// }
/// \endcode
///
/// If the \p keys across threads in a block are <tt>{[1, 2], [3, 4] ..., [255, 256]}</tt> and
/// the \p values are <tt>{[128, 128], [127, 127] ..., [2, 2], [1, 1]}</tt>, then after sort
/// the \p keys will be equal <tt>{[256, 255], ..., [4, 3], [2, 1]}</tt> and the \p values
/// will be equal <tt>{[1, 1], [2, 2] ..., [128, 128]}</tt>.
/// \endparblock
template<bool WithValues = with_values>
ROCPRIM_DEVICE ROCPRIM_INLINE
void sort_desc(Key (&keys)[ItemsPerThread],
               typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
               storage_type& storage,
               unsigned int begin_bit = 0,
               unsigned int end_bit = 8 * sizeof(Key))
{
    // The enable_if in the signature removes this overload when WithValues is
    // false (its default, with_values, is false for Value == empty_type).
    // Descending variant calls sort_impl<true>; sort uses sort_impl<false>.
    sort_impl<true>(keys, values, storage, begin_bit, end_bit);
}
/// \overload
/// \brief Performs descending radix sort over key-value pairs partitioned across
/// threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \pre Method is enabled only if \p Value type is different than empty_type.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in, out] values - reference to an array of values provided by a thread.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
template<bool WithValues = with_values>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void sort_desc(Key (&keys)[ItemsPerThread],
               typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
               unsigned int begin_bit = 0,
               unsigned int end_bit = 8 * sizeof(Key))
{
    // Temporary storage is declared here (ROCPRIM_SHARED_MEMORY) so the caller
    // does not have to supply it.
    ROCPRIM_SHARED_MEMORY storage_type local_storage;

    // Forward to the key-value overload that takes explicit storage.
    this->sort_desc(keys, values, local_storage, begin_bit, end_bit);
}
/// \brief Performs ascending radix sort over keys partitioned across threads in a block,
/// results are saved in a striped arrangement.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// In the examples radix sort is performed on a block of 128 threads, each thread provides
/// two \p float values, results are returned using the same array as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_radix_sort for float, block of 128 threads,
/// // and two items per thread; key-only sort
/// using block_rsort_float = rocprim::block_radix_sort<float, 128, 2>;
/// // allocate storage in shared memory
/// __shared__ block_rsort_float::storage_type storage;
///
/// float keys[2] = ...;
/// // execute block radix sort (ascending)
/// block_rsort_float().sort_to_striped(
/// keys,
/// storage
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{[256, 255], ..., [4, 3], [2, 1]}</tt>,
/// then after sort they will be equal <tt>{[1, 129], [2, 130] ..., [128, 256]}</tt>.
/// \endparblock
ROCPRIM_DEVICE ROCPRIM_INLINE
void sort_to_striped(Key (&keys)[ItemsPerThread],
                     storage_type& storage,
                     unsigned int  begin_bit = 0,
                     unsigned int  end_bit   = 8 * sizeof(Key))
{
    // Key-only sort: sort_impl always takes a values array, so a placeholder
    // array of empty_type items is passed in its place.
    empty_type no_values[ItemsPerThread];
    sort_impl</* Descending */ false, /* ToStriped */ true>(keys,
                                                           no_values,
                                                           storage,
                                                           begin_bit,
                                                           end_bit);
}
/// \overload
/// \brief Performs ascending radix sort over keys partitioned across threads in a block,
/// results are saved in a striped arrangement.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void sort_to_striped(Key (&keys)[ItemsPerThread],
                     unsigned int begin_bit = 0,
                     unsigned int end_bit   = 8 * sizeof(Key))
{
    // Declare the temporary storage locally, then defer to the overload
    // that receives it explicitly.
    ROCPRIM_SHARED_MEMORY storage_type shared_storage;
    sort_to_striped(keys, shared_storage, begin_bit, end_bit);
}
/// \brief Performs descending radix sort over keys partitioned across threads in a block,
/// results are saved in a striped arrangement.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// In the examples radix sort is performed on a block of 128 threads, each thread provides
/// two \p float values, results are returned using the same array as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_radix_sort for float, block of 128 threads,
/// // and two items per thread; key-only sort
/// using block_rsort_float = rocprim::block_radix_sort<float, 128, 2>;
/// // allocate storage in shared memory
/// __shared__ block_rsort_float::storage_type storage;
///
/// float input[2] = ...;
/// // execute block radix sort (descending)
/// block_rsort_float().sort_desc_to_striped(
/// input,
/// storage
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{[1, 2], [3, 4] ..., [255, 256]}</tt>,
/// then after sort they will be equal <tt>{[256, 128], ..., [130, 2], [129, 1]}</tt>.
/// \endparblock
ROCPRIM_DEVICE ROCPRIM_INLINE
void sort_desc_to_striped(Key (&keys)[ItemsPerThread],
                          storage_type& storage,
                          unsigned int  begin_bit = 0,
                          unsigned int  end_bit   = 8 * sizeof(Key))
{
    // Key-only sort: sort_impl always takes a values array, so a placeholder
    // array of empty_type items is passed in its place.
    empty_type no_values[ItemsPerThread];
    sort_impl</* Descending */ true, /* ToStriped */ true>(keys,
                                                          no_values,
                                                          storage,
                                                          begin_bit,
                                                          end_bit);
}
/// \overload
/// \brief Performs descending radix sort over keys partitioned across threads in a block,
/// results are saved in a striped arrangement.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void sort_desc_to_striped(Key (&keys)[ItemsPerThread],
                          unsigned int begin_bit = 0,
                          unsigned int end_bit   = 8 * sizeof(Key))
{
    // Declare the temporary storage locally, then defer to the overload
    // that receives it explicitly.
    ROCPRIM_SHARED_MEMORY storage_type shared_storage;
    sort_desc_to_striped(keys, shared_storage, begin_bit, end_bit);
}
/// \brief Performs ascending radix sort over key-value pairs partitioned across
/// threads in a block, results are saved in a striped arrangement.
///
/// \pre Method is enabled only if \p Value type is different than empty_type.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in, out] values - reference to an array of values provided by a thread.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// In the examples radix sort is performed on a block of 4 threads, each thread provides
/// two key-value <tt>int</tt>-<tt>float</tt> pairs, results are returned using the same
/// arrays as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_radix_sort for int-float pairs, block of 4
/// // threads, and two items per thread
/// using block_rsort_ii = rocprim::block_radix_sort<int, 4, 2, float>;
/// // allocate storage in shared memory
/// __shared__ block_rsort_ii::storage_type storage;
///
/// int keys[2] = ...;
/// float values[2] = ...;
/// // execute block radix sort-by-key (ascending)
/// block_rsort_ii().sort_to_striped(
/// keys, values,
/// storage
/// );
/// ...
/// }
/// \endcode
///
/// If the \p keys across threads in a block are <tt>{[8, 7], [6, 5], [4, 3], [2, 1]}</tt> and
/// the \p values are <tt>{[-1, -2], [-3, -4], [-5, -6], [-7, -8]}</tt>, then after sort the
/// \p keys will be equal <tt>{[1, 5], [2, 6], [3, 7], [4, 8]}</tt> and the \p values will be
/// equal <tt>{[-8, -4], [-7, -3], [-6, -2], [-5, -1]}</tt>.
/// \endparblock
template<bool WithValues = with_values>
ROCPRIM_DEVICE ROCPRIM_INLINE
void sort_to_striped(Key (&keys)[ItemsPerThread],
                     typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
                     storage_type& storage,
                     unsigned int  begin_bit = 0,
                     unsigned int  end_bit   = 8 * sizeof(Key))
{
    // Ascending key-value sort followed by the blocked-to-striped step.
    sort_impl</* Descending */ false, /* ToStriped */ true>(keys,
                                                           values,
                                                           storage,
                                                           begin_bit,
                                                           end_bit);
}
/// \overload
/// \brief Performs ascending radix sort over key-value pairs partitioned across
/// threads in a block, results are saved in a striped arrangement.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in, out] values - reference to an array of values provided by a thread.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
template<bool WithValues = with_values>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void sort_to_striped(Key (&keys)[ItemsPerThread],
                     typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
                     unsigned int begin_bit = 0,
                     unsigned int end_bit   = 8 * sizeof(Key))
{
    // The temporary storage is declared by the method itself and handed to the
    // storage-taking overload, which performs the actual sort.
    ROCPRIM_SHARED_MEMORY storage_type shared_storage;
    sort_to_striped(keys, values, shared_storage, begin_bit, end_bit);
}
/// \brief Performs descending radix sort over key-value pairs partitioned across
/// threads in a block, results are saved in a striped arrangement.
///
/// \pre Method is enabled only if \p Value type is different than empty_type.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in, out] values - reference to an array of values provided by a thread.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// In the examples radix sort is performed on a block of 4 threads, each thread provides
/// two key-value <tt>int</tt>-<tt>float</tt> pairs, results are returned using the same
/// arrays as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_radix_sort for int-float pairs, block of 4
/// // threads, and two items per thread
/// using block_rsort_ii = rocprim::block_radix_sort<int, 4, 2, float>;
/// // allocate storage in shared memory
/// __shared__ block_rsort_ii::storage_type storage;
///
/// int keys[2] = ...;
/// float values[2] = ...;
/// // execute block radix sort-by-key (descending)
/// block_rsort_ii().sort_desc_to_striped(
/// keys, values,
/// storage
/// );
/// ...
/// }
/// \endcode
///
/// If the \p keys across threads in a block are <tt>{[1, 2], [3, 4], [5, 6], [7, 8]}</tt> and
/// the \p values are <tt>{[80, 70], [60, 50], [40, 30], [20, 10]}</tt>, then after sort the
/// \p keys will be equal <tt>{[8, 4], [7, 3], [6, 2], [5, 1]}</tt> and the \p values will be
/// equal <tt>{[10, 50], [20, 60], [30, 70], [40, 80]}</tt>.
/// \endparblock
template<bool WithValues = with_values>
ROCPRIM_DEVICE ROCPRIM_INLINE
void sort_desc_to_striped(Key (&keys)[ItemsPerThread],
                          typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
                          storage_type& storage,
                          unsigned int  begin_bit = 0,
                          unsigned int  end_bit   = 8 * sizeof(Key))
{
    // Descending key-value sort followed by the blocked-to-striped step.
    sort_impl</* Descending */ true, /* ToStriped */ true>(keys,
                                                          values,
                                                          storage,
                                                          begin_bit,
                                                          end_bit);
}
/// \overload
/// \brief Performs descending radix sort over key-value pairs partitioned across
/// threads in a block, results are saved in a striped arrangement.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in, out] values - reference to an array of values provided by a thread.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
template<bool WithValues = with_values>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void sort_desc_to_striped(Key (&keys)[ItemsPerThread],
                          typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
                          unsigned int begin_bit = 0,
                          unsigned int end_bit   = 8 * sizeof(Key))
{
    // The temporary storage is declared by the method itself and handed to the
    // storage-taking overload, which performs the actual sort.
    ROCPRIM_SHARED_MEMORY storage_type shared_storage;
    sort_desc_to_striped(keys, values, shared_storage, begin_bit, end_bit);
}
private:
// Shared implementation behind every public sort* method.
//   Descending - forwarded to radix_key_codec to select the key encoding.
//   ToStriped  - when true, keys and values go through blocked_to_striped
//                after the last pass.
// The sort runs one pass per bit in [begin_bit, end_bit); in each pass the
// items are scattered to the ranks computed from that bit.
template<bool Descending, bool ToStriped = false, class SortedValue>
ROCPRIM_DEVICE ROCPRIM_INLINE
void sort_impl(Key (&keys)[ItemsPerThread],
               SortedValue (&values)[ItemsPerThread],
               storage_type& storage,
               unsigned int  begin_bit,
               unsigned int  end_bit)
{
    using codec = ::rocprim::detail::radix_key_codec<Key, Descending>;

    storage_type_&     raw_storage = storage.get();
    const unsigned int thread_id
        = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();

    // All passes operate on the encoded representation of the keys
    bit_key_type encoded[ItemsPerThread];
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        encoded[item] = codec::encode(keys[item]);
    }

    // Use binary digits (i.e. digits can be 0 or 1)
    for(unsigned int pass = begin_bit; pass < end_bit; ++pass)
    {
        unsigned int digits[ItemsPerThread];
        for(unsigned int item = 0; item < ItemsPerThread; ++item)
        {
            digits[item] = codec::extract_digit(encoded[item], pass, 1);
        }

        unsigned int ranks[ItemsPerThread];
#ifdef __HIP_CPU_RT__
        // TODO: Check if really necessary
        // Initialize contents, as non-hipcc compilers don't unconditionally zero out allocated memory
        std::memset(ranks, 0, sizeof(ranks));
#endif
        // Written by exclusive_scan below; presumably the block-wide sum of
        // the digits, i.e. the number of items whose current bit is 1.
        unsigned int scan_total;
        bit_block_scan().exclusive_scan(digits, ranks, scan_total, raw_storage.bit_block_scan);

        // Scatter keys to computed positions considering starting positions of their digit values
        const unsigned int ones_begin = BlockSize * ItemsPerThread - scan_total;
        for(unsigned int item = 0; item < ItemsPerThread; ++item)
        {
            // Calculate position for the first digit (0) value based on positions of the second (1)
            if(digits[item] != 0)
            {
                ranks[item] = ones_begin + ranks[item];
            }
            else
            {
                ranks[item] = thread_id * ItemsPerThread + item - ranks[item];
            }
        }

        exchange_keys(storage, encoded, ranks);
        exchange_values(storage, values, ranks);
    }

    if(ToStriped)
    {
        to_striped_keys(storage, encoded);
        to_striped_values(storage, values);
    }

    // Convert the encoded keys back into the caller's key type
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        keys[item] = codec::decode(encoded[item]);
    }
}
// Scatters the encoded keys in place to the positions given by ranks,
// using the bit_keys_exchange member of the temporary storage.
ROCPRIM_DEVICE ROCPRIM_INLINE
void exchange_keys(storage_type& storage,
                   bit_key_type (&bit_keys)[ItemsPerThread],
                   const unsigned int (&ranks)[ItemsPerThread])
{
    storage_type_& raw_storage = storage.get();

    // Synchronization is omitted here because bit_block_scan already calls it
    bit_keys_exchange_type().scatter_to_blocked(bit_keys,
                                                bit_keys,
                                                ranks,
                                                raw_storage.bit_keys_exchange);
}
// Scatters the values in place to the positions given by ranks,
// using the values_exchange member of the temporary storage.
template<class SortedValue>
ROCPRIM_DEVICE ROCPRIM_INLINE
void exchange_values(storage_type& storage,
                     SortedValue (&values)[ItemsPerThread],
                     const unsigned int (&ranks)[ItemsPerThread])
{
    storage_type_& raw_storage = storage.get();

    // Storage will be reused (union), synchronization is needed
    ::rocprim::syncthreads();
    values_exchange_type().scatter_to_blocked(values,
                                              values,
                                              ranks,
                                              raw_storage.values_exchange);
}
// Overload taken when the values are empty_type placeholders:
// there is nothing to exchange, so the body does no work.
ROCPRIM_DEVICE ROCPRIM_INLINE
void exchange_values(storage_type& storage,
                     empty_type (&values)[ItemsPerThread],
                     const unsigned int (&ranks)[ItemsPerThread])
{
    // The casts only mark the parameters as used
    static_cast<void>(storage);
    static_cast<void>(values);
    static_cast<void>(ranks);
}
// Converts the encoded keys in place from the blocked to the striped
// arrangement, using the bit_keys_exchange member of the temporary storage.
ROCPRIM_DEVICE ROCPRIM_INLINE
void to_striped_keys(storage_type& storage,
                     bit_key_type (&bit_keys)[ItemsPerThread])
{
    storage_type_& raw_storage = storage.get();

    // Barrier before the exchange storage is used again
    ::rocprim::syncthreads();
    bit_keys_exchange_type().blocked_to_striped(bit_keys,
                                                bit_keys,
                                                raw_storage.bit_keys_exchange);
}
// Converts the values in place from the blocked to the striped
// arrangement, using the values_exchange member of the temporary storage.
template<class SortedValue>
ROCPRIM_DEVICE ROCPRIM_INLINE
void to_striped_values(storage_type& storage,
                       SortedValue (&values)[ItemsPerThread])
{
    storage_type_& raw_storage = storage.get();

    // Storage will be reused (union), synchronization is needed
    ::rocprim::syncthreads();
    values_exchange_type().blocked_to_striped(values,
                                              values,
                                              raw_storage.values_exchange);
}
// Overload taken when the values are empty_type placeholders:
// there is nothing to rearrange, so the body does no work.
ROCPRIM_DEVICE ROCPRIM_INLINE
void to_striped_values(storage_type& storage,
                       empty_type*   values)
{
    // The casts only mark the parameters as used
    static_cast<void>(storage);
    static_cast<void>(values);
}
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_RADIX_SORT_HPP_
3rdparty/cub/rocprim/block/block_reduce.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_REDUCE_HPP_
#define ROCPRIM_BLOCK_BLOCK_REDUCE_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "detail/block_reduce_warp_reduce.hpp"
#include "detail/block_reduce_raking_reduce.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Available algorithms for block_reduce primitive.
enum class block_reduce_algorithm
{
    /// \brief Reduction built on top of warp_reduce.
    using_warp_reduce,

    /// \brief Raking reduction; calculations are limited to a single hardware warp.
    raking_reduce,

    /// \brief Raking reduction variant that supports only commutative operators.
    raking_reduce_commutative_only,

    /// \brief Algorithm used by block_reduce when none is specified.
    default_algorithm = using_warp_reduce,
};
namespace detail
{

// Compile-time mapping from a block_reduce_algorithm enumerator to the class
// template implementing it. The primary template is only declared; each
// supported enumerator has an explicit specialization exposing the
// implementation through the nested alias template `type`.
template<block_reduce_algorithm Algorithm>
struct select_block_reduce_impl;

// using_warp_reduce -> block_reduce_warp_reduce
template<>
struct select_block_reduce_impl<block_reduce_algorithm::using_warp_reduce>
{
    template<class T,
             unsigned int BlockSizeX,
             unsigned int BlockSizeY,
             unsigned int BlockSizeZ>
    using type = block_reduce_warp_reduce<T, BlockSizeX, BlockSizeY, BlockSizeZ>;
};

// raking_reduce -> block_reduce_raking_reduce
template<>
struct select_block_reduce_impl<block_reduce_algorithm::raking_reduce>
{
    template<class T,
             unsigned int BlockSizeX,
             unsigned int BlockSizeY,
             unsigned int BlockSizeZ>
    using type = block_reduce_raking_reduce<T, BlockSizeX, BlockSizeY, BlockSizeZ>;
};

// raking_reduce_commutative_only -> block_reduce_raking_reduce with its
// fifth template argument set to true
template<>
struct select_block_reduce_impl<block_reduce_algorithm::raking_reduce_commutative_only>
{
    template<class T,
             unsigned int BlockSizeX,
             unsigned int BlockSizeY,
             unsigned int BlockSizeZ>
    using type = block_reduce_raking_reduce<T, BlockSizeX, BlockSizeY, BlockSizeZ, true>;
};

} // end namespace detail
/// \brief The block_reduce class is a block level parallel primitive which provides methods
/// for performing reductions operations on items partitioned across threads in a block.
///
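/// \tparam T - the input/output type.
/// \tparam BlockSizeX - the number of threads in the first dimension of a block.
/// \tparam Algorithm - selected reduce algorithm, block_reduce_algorithm::default_algorithm by default.
/// \tparam BlockSizeY - the number of threads in the second dimension of a block, defaults to 1.
/// \tparam BlockSizeZ - the number of threads in the third dimension of a block, defaults to 1.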
///
/// \par Overview
/// * Supports non-commutative reduce operators. However, a reduce operator should be
/// associative. When used with non-associative functions the results may be non-deterministic
/// and/or vary in precision.
/// * Computation can be more efficient when:
/// * \p ItemsPerThread is greater than one,
/// * \p T is an arithmetic type,
/// * reduce operation is simple addition operator, and
/// * the number of threads in the block is a multiple of the hardware warp size (see rocprim::device_warp_size()).
/// * block_reduce has three alternative implementations: \p block_reduce_algorithm::using_warp_reduce,
/// \p block_reduce_algorithm::raking_reduce and \p block_reduce_algorithm::raking_reduce_commutative_only.
/// * If the block size is less than 64, only one warp reduction is used. The block reduction algorithm
/// stores the result only in the first thread (lane_id = 0, warp_id = 0) when the block size is
/// larger than the warp size.
///
/// \par Examples
/// \parblock
/// In the examples reduce operation is performed on block of 192 threads, each provides
/// one \p int value, result is returned using the same variable as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_reduce for int and block of 192 threads
/// using block_reduce_int = rocprim::block_reduce<int, 192>;
/// // allocate storage in shared memory
/// __shared__ block_reduce_int::storage_type storage;
///
/// int value = ...;
/// // execute reduce
/// block_reduce_int().reduce(
/// value, // input
/// value, // output
/// storage
/// );
/// ...
/// }
/// \endcode
/// \endparblock
template
<
class
T
,
unsigned
int
BlockSizeX
,
block_reduce_algorithm
Algorithm
=
block_reduce_algorithm
::
default_algorithm
,
unsigned
int
BlockSizeY
=
1
,
unsigned
int
BlockSizeZ
=
1
>
class
block_reduce
#ifndef DOXYGEN_SHOULD_SKIP_THIS
:
private
detail
::
select_block_reduce_impl
<
Algorithm
>::
template
type
<
T
,
BlockSizeX
,
BlockSizeY
,
BlockSizeZ
>
#endif
{
using
base_type
=
typename
detail
::
select_block_reduce_impl
<
Algorithm
>::
template
type
<
T
,
BlockSizeX
,
BlockSizeY
,
BlockSizeZ
>;
public:
/// \brief Struct used to allocate a temporary memory that is required for thread
/// communication during operations provided by related parallel primitive.
///
/// Depending on the implementation the operations exposed by parallel primitive may
/// require a temporary storage for thread communication. The storage should be allocated
/// using keywords <tt>__shared__</tt>. It can be aliased to
/// an externally allocated memory, or be a part of a union type with other storage types
/// to increase shared memory reusability.
using
storage_type
=
typename
base_type
::
storage_type
;
/// \brief Performs reduction across threads in a block.
///
/// \tparam BinaryFunction - type of binary function used for reduce. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] reduce_op - binary operation function object that will be used for reduce.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present min reduce operations performed on a block of 256 threads,
/// each provides one \p float value.
///
/// \code{.cpp}
/// __global__ void example_kernel(...) // blockDim.x = 256
/// {
/// // specialize block_reduce for float and block of 256 threads
/// using block_reduce_float = rocprim::block_reduce<float, 256>;
/// // allocate storage in shared memory for the block
/// __shared__ block_reduce_float::storage_type storage;
///
/// float input = ...;
/// float output;
/// // execute min reduce
/// block_reduce_float().reduce(
/// input,
/// output,
/// storage,
/// rocprim::minimum<float>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{1, -2, 3, -4, ..., 255, -256}</tt>, then
/// \p output value will be <tt>{-256}</tt>.
/// \endparblock
template<class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_INLINE
void reduce(T               input,
            T&              output,
            storage_type&   storage,
            BinaryFunction  reduce_op = BinaryFunction())
{
    // Forward to the implementation class selected by Algorithm (base_type)
    this->base_type::reduce(input, output, storage, reduce_op);
}
/// \overload
/// \brief Performs reduction across threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \tparam BinaryFunction - type of binary function used for reduce. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [in] reduce_op - binary operation function object that will be used for reduce.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void reduce(T              input,
            T&             output,
            BinaryFunction reduce_op = BinaryFunction())
{
    // Forward to the storage-less overload of the implementation class
    // selected by Algorithm (base_type)
    this->base_type::reduce(input, output, reduce_op);
}
/// \brief Performs reduction across threads in a block.
///
/// \tparam ItemsPerThread - number of items in the \p input array.
/// \tparam BinaryFunction - type of binary function used for reduce. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] output - reference to a thread output value.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] reduce_op - binary operation function object that will be used for reduce.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present maximum reduce operations performed on a block of 128 threads,
/// each provides two \p long values.
///
/// \code{.cpp}
/// __global__ void example_kernel(...) // blockDim.x = 128
/// {
/// // specialize block_reduce for long and block of 128 threads
/// using block_reduce_long = rocprim::block_reduce<long, 128>;
/// // allocate storage in shared memory for the block
/// __shared__ block_reduce_long::storage_type storage;
///
/// long input[2] = ...;
/// long output;
/// // execute max reduce
/// block_reduce_long().reduce(
/// input,
/// output,
/// storage,
/// rocprim::maximum<long>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{-1, 2, -3, 4, ..., -255, 256}</tt>, then
/// \p output value will be <tt>{256}</tt>.
/// \endparblock
template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_INLINE
void reduce(T (&input)[ItemsPerThread],
            T&             output,
            storage_type&  storage,
            BinaryFunction reduce_op = BinaryFunction())
{
    // Forward the per-thread array to the implementation class selected by
    // Algorithm (base_type)
    this->base_type::reduce(input, output, storage, reduce_op);
}
/// \overload
/// \brief Performs reduction across threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \tparam ItemsPerThread - number of items in the \p input array.
/// \tparam BinaryFunction - type of binary function used for reduce. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] output - reference to a thread output value.
/// \param [in] reduce_op - binary operation function object that will be used for reduce.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void reduce(T (&input)[ItemsPerThread],
            T&             output,
            BinaryFunction reduce_op = BinaryFunction())
{
    // Forward the per-thread array to the storage-less overload of the
    // implementation class selected by Algorithm (base_type)
    this->base_type::reduce(input, output, reduce_op);
}
/// \brief Performs reduction across threads in a block.
///
/// \tparam BinaryFunction - type of binary function used for reduce. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [in] valid_items - number of items that will be reduced in the block.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] reduce_op - binary operation function object that will be used for reduce.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present min reduce operations performed on a block of 256 threads,
/// each provides one \p float value.
///
/// \code{.cpp}
/// __global__ void example_kernel(...) // blockDim.x = 256
/// {
/// // specialize block_reduce for float and block of 256 threads
/// using block_reduce_float = rocprim::block_reduce<float, 256>;
/// // allocate storage in shared memory for the block
/// __shared__ block_reduce_float::storage_type storage;
///
/// float input = ...;
/// unsigned int valid_items = 250;
/// float output;
/// // execute min reduce
/// block_reduce_float().reduce(
/// input,
/// output,
/// valid_items,
/// storage,
/// rocprim::minimum<float>()
/// );
/// ...
/// }
/// \endcode
/// \endparblock
template<class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_INLINE
void reduce(T              input,
            T&             output,
            unsigned int   valid_items,
            storage_type&  storage,
            BinaryFunction reduce_op = BinaryFunction())
{
    // Pure forwarder: valid_items is passed through untouched together with the
    // caller-supplied storage.
    base_type::reduce(input, output, valid_items, storage, reduce_op);
}
/// \overload
/// \brief Performs reduction across threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \tparam BinaryFunction - type of binary function used for reduce. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [in] valid_items - number of items that will be reduced in the block.
/// \param [in] reduce_op - binary operation function object that will be used for reduce.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void reduce(T              input,
            T&             output,
            unsigned int   valid_items,
            BinaryFunction reduce_op = BinaryFunction())
{
    // Storage-less variant of the valid_items reduce: forwards to the base_type
    // overload that takes no storage argument.
    base_type::reduce(input, output, valid_items, reduce_op);
}
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_REDUCE_HPP_
3rdparty/cub/rocprim/block/block_scan.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_SCAN_HPP_
#define ROCPRIM_BLOCK_BLOCK_SCAN_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "detail/block_scan_warp_scan.hpp"
#include "detail/block_scan_reduce_then_scan.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Available algorithms for block_scan primitive.
enum class block_scan_algorithm
{
    /// \brief Scan algorithm built on warp_scan.
    using_warp_scan,

    /// \brief Scan algorithm that limits calculations to a single hardware warp.
    reduce_then_scan,

    /// \brief Algorithm used by block_scan when none is requested explicitly;
    /// an alias of \p using_warp_scan.
    default_algorithm = using_warp_scan
};
namespace detail
{

// Compile-time selector: maps a block_scan_algorithm enumerator to the class
// template implementing it. Only the explicit specializations below are defined.
template<block_scan_algorithm Algorithm>
struct select_block_scan_impl;

// using_warp_scan always resolves to block_scan_warp_scan.
template<>
struct select_block_scan_impl<block_scan_algorithm::using_warp_scan>
{
    template<class T,
             unsigned int BlockSizeX,
             unsigned int BlockSizeY,
             unsigned int BlockSizeZ>
    using type = block_scan_warp_scan<T, BlockSizeX, BlockSizeY, BlockSizeZ>;
};

// reduce_then_scan resolves to block_scan_reduce_then_scan only when the block
// holds more threads than one hardware warp.
template<>
struct select_block_scan_impl<block_scan_algorithm::reduce_then_scan>
{
    // For blocks that are not larger than the hardware warp size,
    // block_scan_warp_scan performs better than block_scan_reduce_then_scan
    // because it specializes for warps, so it is chosen instead.
    template<class T,
             unsigned int BlockSizeX,
             unsigned int BlockSizeY,
             unsigned int BlockSizeZ>
    using type = typename std::conditional<
        (BlockSizeX * BlockSizeY * BlockSizeZ > ::rocprim::device_warp_size()),
        block_scan_reduce_then_scan<T, BlockSizeX, BlockSizeY, BlockSizeZ>,
        block_scan_warp_scan<T, BlockSizeX, BlockSizeY, BlockSizeZ>
    >::type;
};

} // end namespace detail
/// \brief The block_scan class is a block level parallel primitive which provides methods
/// for performing inclusive and exclusive scan operations of items partitioned across
/// threads in a block.
///
/// \tparam T - the input/output type.
/// \tparam BlockSizeX - the number of threads in a block's x dimension.
/// \tparam Algorithm - selected scan algorithm, block_scan_algorithm::default_algorithm by default.
/// \tparam BlockSizeY - the number of threads in a block's y dimension, defaults to 1.
/// \tparam BlockSizeZ - the number of threads in a block's z dimension, defaults to 1.
///
/// \par Overview
/// * Supports non-commutative scan operators. However, a scan operator should be
/// associative. When used with non-associative functions the results may be non-deterministic
/// and/or vary in precision.
/// * Computation can be more efficient when:
/// * \p ItemsPerThread is greater than one,
/// * \p T is an arithmetic type,
/// * scan operation is simple addition operator, and
/// * the number of threads in the block is a multiple of the hardware warp size (see rocprim::device_warp_size()).
/// * block_scan has two alternative implementations: \p block_scan_algorithm::using_warp_scan
/// and block_scan_algorithm::reduce_then_scan.
///
/// \par Examples
/// \parblock
/// In the examples scan operation is performed on block of 192 threads, each provides
/// one \p int value, result is returned using the same variable as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_scan for int and block of 192 threads
/// using block_scan_int = rocprim::block_scan<int, 192>;
/// // allocate storage in shared memory
/// __shared__ block_scan_int::storage_type storage;
///
/// int value = ...;
/// // execute inclusive scan
/// block_scan_int().inclusive_scan(
/// value, // input
/// value, // output
/// storage
/// );
/// ...
/// }
/// \endcode
/// \endparblock
template
<
class
T
,
unsigned
int
BlockSizeX
,
block_scan_algorithm
Algorithm
=
block_scan_algorithm
::
default_algorithm
,
unsigned
int
BlockSizeY
=
1
,
unsigned
int
BlockSizeZ
=
1
>
class
block_scan
#ifndef DOXYGEN_SHOULD_SKIP_THIS
:
private
detail
::
select_block_scan_impl
<
Algorithm
>::
template
type
<
T
,
BlockSizeX
,
BlockSizeY
,
BlockSizeZ
>
#endif
{
// Implementation type picked by detail::select_block_scan_impl for \p Algorithm,
// instantiated with this class's value type and block dimensions.
using base_type = typename detail::select_block_scan_impl<Algorithm>::template type<
    T, BlockSizeX, BlockSizeY, BlockSizeZ>;
public:
/// \brief Struct used to allocate a temporary memory that is required for thread
/// communication during operations provided by related parallel primitive.
///
/// Depending on the implementation the operations exposed by parallel primitive may
/// require a temporary storage for thread communication. The storage should be allocated
/// using keywords <tt>__shared__</tt>. It can be aliased to
/// an externally allocated memory, or be a part of a union type with other storage types
/// to increase shared memory reusability.
// Re-exports the storage type declared by base_type.
using storage_type = typename base_type::storage_type;
/// \brief Performs inclusive scan across threads in a block.
///
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present inclusive min scan operations performed on a block of 256 threads,
/// each provides one \p float value.
///
/// \code{.cpp}
/// __global__ void example_kernel(...) // blockDim.x = 256
/// {
/// // specialize block_scan for float and block of 256 threads
/// using block_scan_float = rocprim::block_scan<float, 256>;
/// // allocate storage in shared memory for the block
/// __shared__ block_scan_float::storage_type storage;
///
/// float input = ...;
/// float output;
/// // execute inclusive min scan
/// block_scan_float().inclusive_scan(
/// input,
/// output,
/// storage,
/// rocprim::minimum<float>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{1, -2, 3, -4, ..., 255, -256}</tt>, then
/// \p output values in will be <tt>{1, -2, -2, -4, ..., -254, -256}</tt>.
/// \endparblock
template<class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_INLINE
void inclusive_scan(T              input,
                    T&             output,
                    storage_type&  storage,
                    BinaryFunction scan_op = BinaryFunction())
{
    // Pure forwarder to base_type, using the caller-supplied storage.
    base_type::inclusive_scan(input, output, storage, scan_op);
}
/// \overload
/// \brief Performs inclusive scan across threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void inclusive_scan(T              input,
                    T&             output,
                    BinaryFunction scan_op = BinaryFunction())
{
    // Storage-less variant: forwards to the base_type overload that takes no
    // storage argument.
    base_type::inclusive_scan(input, output, scan_op);
}
/// \brief Performs inclusive scan and reduction across threads in a block.
///
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [out] reduction - result of reducing of all \p input values in a block.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present inclusive min scan operations performed on a block of 256 threads,
/// each provides one \p float value.
///
/// \code{.cpp}
/// __global__ void example_kernel(...) // blockDim.x = 256
/// {
/// // specialize block_scan for float and block of 256 threads
/// using block_scan_float = rocprim::block_scan<float, 256>;
/// // allocate storage in shared memory for the block
/// __shared__ block_scan_float::storage_type storage;
///
/// float input = ...;
/// float output;
/// float reduction;
/// // execute inclusive min scan
/// block_scan_float().inclusive_scan(
/// input,
/// output,
/// reduction,
/// storage,
/// rocprim::minimum<float>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{1, -2, 3, -4, ..., 255, -256}</tt>, then
/// \p output values in will be <tt>{1, -2, -2, -4, ..., -254, -256}</tt>, and the \p reduction will
/// be <tt>-256</tt>.
/// \endparblock
template<class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_INLINE
void inclusive_scan(T              input,
                    T&             output,
                    T&             reduction,
                    storage_type&  storage,
                    BinaryFunction scan_op = BinaryFunction())
{
    // Pure forwarder: base_type writes both the scan result and the reduction
    // through the references passed on here.
    base_type::inclusive_scan(input, output, reduction, storage, scan_op);
}
/// \overload
/// \brief Performs inclusive scan and reduction across threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [out] reduction - result of reducing of all \p input values in a block.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void inclusive_scan(T              input,
                    T&             output,
                    T&             reduction,
                    BinaryFunction scan_op = BinaryFunction())
{
    // Storage-less scan-with-reduction: forwards to the base_type overload that
    // takes no storage argument.
    base_type::inclusive_scan(input, output, reduction, scan_op);
}
/// \brief Performs inclusive scan across threads in a block, and uses
/// \p prefix_callback_op to generate prefix value for the whole block.
///
/// \tparam PrefixCallback - type of the unary function object used for generating
/// block-wide prefix value for the scan operation.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in,out] prefix_callback_op - function object for generating block prefix value.
/// The signature of the \p prefix_callback_op should be equivalent to the following:
/// <tt>T f(const T &block_reduction);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The object will be called by the first warp of the block with block reduction of
/// \p input values as input argument. The result of the first thread will be used as the
/// block-wide prefix.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present inclusive prefix sum operations performed on a block of 256 threads,
/// each thread provides one \p int value.
///
/// \code{.cpp}
///
/// struct my_block_prefix
/// {
/// int prefix;
///
/// __device__ my_block_prefix(int prefix) : prefix(prefix) {}
///
/// __device__ int operator()(int block_reduction)
/// {
/// int old_prefix = prefix;
/// prefix = prefix + block_reduction;
/// return old_prefix;
/// }
/// };
///
/// __global__ void example_kernel(...) // blockDim.x = 256
/// {
/// // specialize block_scan for int and block of 256 threads
/// using block_scan_int = rocprim::block_scan<int, 256>;
/// // allocate storage in shared memory for the block
/// __shared__ block_scan_int::storage_type storage;
///
/// // init prefix functor
/// my_block_prefix prefix_callback(10);
///
/// int input;
/// int output;
/// // execute inclusive prefix sum
/// block_scan_int().inclusive_scan(
/// input,
/// output,
/// storage,
/// prefix_callback,
/// rocprim::plus<int>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{1, 1, 1, ..., 1}</tt>, then
/// \p output values in will be <tt>{11, 12, 13, ..., 266}</tt>, and the \p prefix will
/// be <tt>266</tt>.
/// \endparblock
template<class PrefixCallback, class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_INLINE
void inclusive_scan(T               input,
                    T&              output,
                    storage_type&   storage,
                    PrefixCallback& prefix_callback_op,
                    BinaryFunction  scan_op)
{
    // Pure forwarder. The callback is taken and passed on by reference, so the
    // same callback object reaches base_type.
    base_type::inclusive_scan(input, output, storage, prefix_callback_op, scan_op);
}
/// \brief Performs inclusive scan across threads in a block.
///
/// \tparam ItemsPerThread - number of items in the \p input array.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] output - reference to a thread output array. May be aliased with \p input.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present inclusive maximum scan operations performed on a block of 128 threads,
/// each provides two \p long values.
///
/// \code{.cpp}
/// __global__ void example_kernel(...) // blockDim.x = 128
/// {
/// // specialize block_scan for long and block of 128 threads
/// using block_scan_long = rocprim::block_scan<long, 128>;
/// // allocate storage in shared memory for the block
/// __shared__ block_scan_long::storage_type storage;
///
/// long input[2] = ...;
/// long output[2];
/// // execute inclusive max scan
/// block_scan_long().inclusive_scan(
/// input,
/// output,
/// storage,
/// rocprim::maximum<long>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{-1, 2, -3, 4, ..., -255, 256}</tt>, then
/// \p output values in will be <tt>{-1, 2, 2, 4, ..., 254, 256}</tt>.
/// \endparblock
template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_INLINE
void inclusive_scan(T (&input)[ItemsPerThread],
                    T (&output)[ItemsPerThread],
                    storage_type&  storage,
                    BinaryFunction scan_op = BinaryFunction())
{
    // One-element arrays are routed to base_type's single-value overload.
    if(ItemsPerThread == 1)
    {
        base_type::inclusive_scan(input[0], output[0], storage, scan_op);
        return;
    }

    // Any other array length goes to base_type's array overload as is.
    base_type::inclusive_scan(input, output, storage, scan_op);
}
/// \overload
/// \brief Performs inclusive scan across threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \tparam ItemsPerThread - number of items in the \p input array.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] output - reference to a thread output array. May be aliased with \p input.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void inclusive_scan(T (&input)[ItemsPerThread],
                    T (&output)[ItemsPerThread],
                    BinaryFunction scan_op = BinaryFunction())
{
    // One-element arrays are routed to base_type's single-value, storage-less overload.
    if(ItemsPerThread == 1)
    {
        base_type::inclusive_scan(input[0], output[0], scan_op);
        return;
    }

    // Any other array length goes to base_type's storage-less array overload.
    base_type::inclusive_scan(input, output, scan_op);
}
/// \brief Performs inclusive scan and reduction across threads in a block.
///
/// \tparam ItemsPerThread - number of items in the \p input array.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] output - reference to a thread output array. May be aliased with \p input.
/// \param [out] reduction - result of reducing of all \p input values in a block.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present inclusive maximum scan operations performed on a block of 128 threads,
/// each provides two \p long values.
///
/// \code{.cpp}
/// __global__ void example_kernel(...) // blockDim.x = 128
/// {
/// // specialize block_scan for long and block of 128 threads
/// using block_scan_long = rocprim::block_scan<long, 128>;
/// // allocate storage in shared memory for the block
/// __shared__ block_scan_long::storage_type storage;
///
/// long input[2] = ...;
/// long output[2];
/// long reduction;
/// // execute inclusive max scan
/// block_scan_long().inclusive_scan(
/// input,
/// output,
/// reduction,
/// storage,
/// rocprim::maximum<long>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{-1, 2, -3, 4, ..., -255, 256}</tt>, then
/// \p output values in will be <tt>{-1, 2, 2, 4, ..., 254, 256}</tt> and the \p reduction will be \p 256.
/// \endparblock
template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_INLINE
void inclusive_scan(T (&input)[ItemsPerThread],
                    T (&output)[ItemsPerThread],
                    T&             reduction,
                    storage_type&  storage,
                    BinaryFunction scan_op = BinaryFunction())
{
    // One-element arrays are routed to base_type's single-value overload.
    if(ItemsPerThread == 1)
    {
        base_type::inclusive_scan(input[0], output[0], reduction, storage, scan_op);
        return;
    }

    // Any other array length goes to base_type's array overload as is.
    base_type::inclusive_scan(input, output, reduction, storage, scan_op);
}
/// \overload
/// \brief Performs inclusive scan and reduction across threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \tparam ItemsPerThread - number of items in the \p input array.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] output - reference to a thread output array. May be aliased with \p input.
/// \param [out] reduction - result of reducing of all \p input values in a block.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void inclusive_scan(T (&input)[ItemsPerThread],
                    T (&output)[ItemsPerThread],
                    T&             reduction,
                    BinaryFunction scan_op = BinaryFunction())
{
    // One-element arrays are routed to base_type's single-value, storage-less overload.
    if(ItemsPerThread == 1)
    {
        base_type::inclusive_scan(input[0], output[0], reduction, scan_op);
        return;
    }

    // Any other array length goes to base_type's storage-less array overload.
    base_type::inclusive_scan(input, output, reduction, scan_op);
}
/// \brief Performs inclusive scan across threads in a block, and uses
/// \p prefix_callback_op to generate prefix value for the whole block.
///
/// \tparam ItemsPerThread - number of items in the \p input array.
/// \tparam PrefixCallback - type of the unary function object used for generating
/// block-wide prefix value for the scan operation.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] output - reference to a thread output array. May be aliased with \p input.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in,out] prefix_callback_op - function object for generating block prefix value.
/// The signature of the \p prefix_callback_op should be equivalent to the following:
/// <tt>T f(const T &block_reduction);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The object will be called by the first warp of the block with block reduction of
/// \p input values as input argument. The result of the first thread will be used as the
/// block-wide prefix.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present inclusive prefix sum operations performed on a block of 128 threads,
/// each thread provides two \p int values.
///
/// \code{.cpp}
///
/// struct my_block_prefix
/// {
/// int prefix;
///
/// __device__ my_block_prefix(int prefix) : prefix(prefix) {}
///
/// __device__ int operator()(int block_reduction)
/// {
/// int old_prefix = prefix;
/// prefix = prefix + block_reduction;
/// return old_prefix;
/// }
/// };
///
/// __global__ void example_kernel(...) // blockDim.x = 128
/// {
/// // specialize block_scan for int and block of 128 threads
/// using block_scan_int = rocprim::block_scan<int, 128>;
/// // allocate storage in shared memory for the block
/// __shared__ block_scan_int::storage_type storage;
///
/// // init prefix functor
/// my_block_prefix prefix_callback(10);
///
/// int input[2] = ...;
/// int output[2];
/// // execute inclusive prefix sum
/// block_scan_int().inclusive_scan(
/// input,
/// output,
/// storage,
/// prefix_callback,
/// rocprim::plus<int>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{1, 1, 1, ..., 1}</tt>, then
/// \p output values in will be <tt>{11, 12, 13, ..., 266}</tt>, and the \p prefix will
/// be <tt>266</tt>.
/// \endparblock
template<unsigned int ItemsPerThread, class PrefixCallback, class BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE
void inclusive_scan(T (&input)[ItemsPerThread],
                    T (&output)[ItemsPerThread],
                    storage_type&   storage,
                    PrefixCallback& prefix_callback_op,
                    BinaryFunction  scan_op)
{
    // One-element arrays are routed to base_type's single-value overload; the
    // callback is forwarded by reference in both paths.
    if(ItemsPerThread == 1)
    {
        base_type::inclusive_scan(input[0], output[0], storage, prefix_callback_op, scan_op);
        return;
    }

    // Any other array length goes to base_type's array overload as is.
    base_type::inclusive_scan(input, output, storage, prefix_callback_op, scan_op);
}
/// \brief Performs exclusive scan across threads in a block.
///
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] init - initial value used to start the exclusive scan. Should be the same
/// for all threads in a block.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present exclusive min scan operations performed on a block of 256 threads,
/// each provides one \p float value.
///
/// \code{.cpp}
/// __global__ void example_kernel(...) // blockDim.x = 256
/// {
/// // specialize block_scan for float and block of 256 threads
/// using block_scan_float = rocprim::block_scan<float, 256>;
/// // allocate storage in shared memory for the block
/// __shared__ block_scan_float::storage_type storage;
///
/// float init = ...;
/// float input = ...;
/// float output;
/// // execute exclusive min scan
/// block_scan_float().exclusive_scan(
/// input,
/// output,
/// init,
/// storage,
/// rocprim::minimum<float>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{1, -2, 3, -4, ..., 255, -256}</tt>
/// and \p init is \p 0, then \p output values in will be <tt>{0, 0, -2, -2, -4, ..., -254, -254}</tt>.
/// \endparblock
template<class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_INLINE
void exclusive_scan(T                input,
                    T&               output,
                    T                init,
                    storage_type&    storage,
                    BinaryFunction   scan_op = BinaryFunction())
{
    // Single-value overload with caller-provided storage: all arguments are
    // handed to base_type::exclusive_scan in the same order, unchanged.
    base_type::exclusive_scan(input, output, init, storage, scan_op);
}
/// \overload
/// \brief Performs exclusive scan across threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [in] init - initial value used to start the exclusive scan. Should be the same
/// for all threads in a block.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void exclusive_scan(T                input,
                    T&               output,
                    T                init,
                    BinaryFunction   scan_op = BinaryFunction())
{
    // No storage object is taken here; the call goes to the base_type overload
    // that likewise receives no storage argument.
    base_type::exclusive_scan(input, output, init, scan_op);
}
/// \brief Performs exclusive scan and reduction across threads in a block.
///
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [in] init - initial value used to start the exclusive scan. Should be the same
/// for all threads in a block.
/// \param [out] reduction - result of reducing of all \p input values in a block.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present exclusive min scan operations performed on a block of 256 threads,
/// each provides one \p float value.
///
/// \code{.cpp}
/// __global__ void example_kernel(...) // blockDim.x = 256
/// {
/// // specialize block_scan for float and block of 256 threads
/// using block_scan_float = rocprim::block_scan<float, 256>;
/// // allocate storage in shared memory for the block
/// __shared__ block_scan_float::storage_type storage;
///
/// float init = 0;
/// float input = ...;
/// float output;
/// float reduction;
/// // execute exclusive min scan
/// block_scan_float().exclusive_scan(
/// input,
/// output,
/// init,
/// reduction,
/// storage,
/// rocprim::minimum<float>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{1, -2, 3, -4, ..., 255, -256}</tt>
/// and \p init is \p 0, then \p output values in will be <tt>{0, 0, -2, -2, -4, ..., -254, -254}</tt>
/// and the \p reduction will be \p -256.
/// \endparblock
template<class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_INLINE
void exclusive_scan(T                input,
                    T&               output,
                    T                init,
                    T&               reduction,
                    storage_type&    storage,
                    BinaryFunction   scan_op = BinaryFunction())
{
    // Scan plus reduction with caller-provided storage: both output references
    // (output and reduction) are passed straight through to base_type.
    base_type::exclusive_scan(input, output, init, reduction, storage, scan_op);
}
/// \overload
/// \brief Performs exclusive scan and reduction across threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [in] init - initial value used to start the exclusive scan. Should be the same
/// for all threads in a block.
/// \param [out] reduction - result of reducing of all \p input values in a block.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void exclusive_scan(T                input,
                    T&               output,
                    T                init,
                    T&               reduction,
                    BinaryFunction   scan_op = BinaryFunction())
{
    // Scan plus reduction without a storage argument; forwarded as-is to the
    // matching base_type overload.
    base_type::exclusive_scan(input, output, init, reduction, scan_op);
}
/// \brief Performs exclusive scan across threads in a block, and uses
/// \p prefix_callback_op to generate prefix value for the whole block.
///
/// \tparam PrefixCallback - type of the unary function object used for generating
/// block-wide prefix value for the scan operation.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in,out] prefix_callback_op - function object for generating block prefix value.
/// The signature of the \p prefix_callback_op should be equivalent to the following:
/// <tt>T f(const T &block_reduction);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The object will be called by the first warp of the block with block reduction of
/// \p input values as input argument. The result of the first thread will be used as the
/// block-wide prefix.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present exclusive prefix sum operations performed on a block of 256 threads,
/// each thread provides one \p int value.
///
/// \code{.cpp}
///
/// struct my_block_prefix
/// {
/// int prefix;
///
/// __device__ my_block_prefix(int prefix) : prefix(prefix) {}
///
/// __device__ int operator()(int block_reduction)
/// {
/// int old_prefix = prefix;
/// prefix = prefix + block_reduction;
/// return old_prefix;
/// }
/// };
///
/// __global__ void example_kernel(...) // blockDim.x = 256
/// {
/// // specialize block_scan for int and block of 256 threads
/// using block_scan_int = rocprim::block_scan<int, 256>;
/// // allocate storage in shared memory for the block
/// __shared__ block_scan_int::storage_type storage;
///
/// // init prefix functor
/// my_block_prefix prefix_callback(10);
///
/// int input;
/// int output;
/// // execute exclusive prefix sum
/// block_scan_int().exclusive_scan(
/// input,
/// output,
/// storage,
/// prefix_callback,
/// rocprim::plus<int>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{1, 1, 1, ..., 1}</tt>, then
/// \p output values in will be <tt>{10, 11, 12, 13, ..., 265}</tt>, and the \p prefix will
/// be <tt>266</tt>.
/// \endparblock
template<class PrefixCallback, class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_INLINE
void exclusive_scan(T                input,
                    T&               output,
                    storage_type&    storage,
                    PrefixCallback&  prefix_callback_op,
                    BinaryFunction   scan_op)
{
    // Prefix-callback overload: there is no init value, and scan_op has no
    // default argument, so the caller always supplies it. The callback object
    // received by reference is the one handed on to base_type.
    base_type::exclusive_scan(input, output, storage, prefix_callback_op, scan_op);
}
/// \brief Performs exclusive scan across threads in a block.
///
/// \tparam ItemsPerThread - number of items in the \p input array.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] output - reference to a thread output array. May be aliased with \p input.
/// \param [in] init - initial value used to start the exclusive scan. Should be the same
/// for all threads in a block.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present exclusive maximum scan operations performed on a block of 128 threads,
/// each provides two \p long value.
///
/// \code{.cpp}
/// __global__ void example_kernel(...) // blockDim.x = 128
/// {
/// // specialize block_scan for long and block of 128 threads
/// using block_scan_long = rocprim::block_scan<long, 128>;
/// // allocate storage in shared memory for the block
/// __shared__ block_scan_long::storage_type storage;
///
/// long init = ...;
/// long input[2] = ...;
/// long output[2];
/// // execute exclusive max scan
/// block_scan_long().exclusive_scan(
/// input,
/// output,
/// init,
/// storage,
/// rocprim::maximum<long>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{-1, 2, -3, 4, ..., -255, 256}</tt>
/// and \p init is 0, then \p output values in will be <tt>{0, 0, 2, 2, 4, ..., 254, 254}</tt>.
/// \endparblock
template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_INLINE
void exclusive_scan(T (&input)[ItemsPerThread],
                    T (&output)[ItemsPerThread],
                    T                init,
                    storage_type&    storage,
                    BinaryFunction   scan_op = BinaryFunction())
{
    // Arrays holding more than one item are passed whole to base_type.
    if(ItemsPerThread != 1)
    {
        base_type::exclusive_scan(input, output, init, storage, scan_op);
        return;
    }

    // A one-element array is routed through the single-value overload.
    base_type::exclusive_scan(input[0], output[0], init, storage, scan_op);
}
/// \overload
/// \brief Performs exclusive scan across threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \tparam ItemsPerThread - number of items in the \p input array.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] output - reference to a thread output array. May be aliased with \p input.
/// \param [in] init - initial value used to start the exclusive scan. Should be the same
/// for all threads in a block.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void exclusive_scan(T (&input)[ItemsPerThread],
                    T (&output)[ItemsPerThread],
                    T                init,
                    BinaryFunction   scan_op = BinaryFunction())
{
    // Arrays holding more than one item are passed whole to base_type
    // (storage-less overload).
    if(ItemsPerThread != 1)
    {
        base_type::exclusive_scan(input, output, init, scan_op);
        return;
    }

    // A one-element array is routed through the single-value overload.
    base_type::exclusive_scan(input[0], output[0], init, scan_op);
}
/// \brief Performs exclusive scan and reduction across threads in a block.
///
/// \tparam ItemsPerThread - number of items in the \p input array.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] output - reference to a thread output array. May be aliased with \p input.
/// \param [in] init - initial value used to start the exclusive scan. Should be the same
/// for all threads in a block.
/// \param [out] reduction - result of reducing of all \p input values in a block.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present exclusive maximum scan operations performed on a block of 128 threads,
/// each provides two \p long value.
///
/// \code{.cpp}
/// __global__ void example_kernel(...) // blockDim.x = 128
/// {
/// // specialize block_scan for long and block of 128 threads
/// using block_scan_long = rocprim::block_scan<long, 128>;
/// // allocate storage in shared memory for the block
/// __shared__ block_scan_long::storage_type storage;
///
/// long init = ...;
/// long input[2] = ...;
/// long output[2];
/// long reduction;
/// // execute exclusive max scan
/// block_scan_long().exclusive_scan(
/// input,
/// output,
/// init,
/// reduction,
/// storage,
/// rocprim::maximum<long>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{-1, 2, -3, 4, ..., -255, 256}</tt>
/// and \p init is 0, then \p output values in will be <tt>{0, 0, 2, 2, 4, ..., 254, 254}</tt>
/// and the \p reduction will be \p 256.
/// \endparblock
template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_INLINE
void exclusive_scan(T (&input)[ItemsPerThread],
                    T (&output)[ItemsPerThread],
                    T                init,
                    T&               reduction,
                    storage_type&    storage,
                    BinaryFunction   scan_op = BinaryFunction())
{
    // Arrays holding more than one item are passed whole to base_type.
    if(ItemsPerThread != 1)
    {
        base_type::exclusive_scan(input, output, init, reduction, storage, scan_op);
        return;
    }

    // A one-element array is routed through the single-value overload.
    base_type::exclusive_scan(input[0], output[0], init, reduction, storage, scan_op);
}
/// \overload
/// \brief Performs exclusive scan and reduction across threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \tparam ItemsPerThread - number of items in the \p input array.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] output - reference to a thread output array. May be aliased with \p input.
/// \param [in] init - initial value used to start the exclusive scan. Should be the same
/// for all threads in a block.
/// \param [out] reduction - result of reducing of all \p input values in a block.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void exclusive_scan(T (&input)[ItemsPerThread],
                    T (&output)[ItemsPerThread],
                    T                init,
                    T&               reduction,
                    BinaryFunction   scan_op = BinaryFunction())
{
    // Arrays holding more than one item are passed whole to base_type
    // (storage-less overload).
    if(ItemsPerThread != 1)
    {
        base_type::exclusive_scan(input, output, init, reduction, scan_op);
        return;
    }

    // A one-element array is routed through the single-value overload.
    base_type::exclusive_scan(input[0], output[0], init, reduction, scan_op);
}
/// \brief Performs exclusive scan across threads in a block, and uses
/// \p prefix_callback_op to generate prefix value for the whole block.
///
/// \tparam ItemsPerThread - number of items in the \p input array.
/// \tparam PrefixCallback - type of the unary function object used for generating
/// block-wide prefix value for the scan operation.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] output - reference to a thread output array. May be aliased with \p input.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in,out] prefix_callback_op - function object for generating block prefix value.
/// The signature of the \p prefix_callback_op should be equivalent to the following:
/// <tt>T f(const T &block_reduction);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The object will be called by the first warp of the block with block reduction of
/// \p input values as input argument. The result of the first thread will be used as the
/// block-wide prefix.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present exclusive prefix sum operations performed on a block of 128 threads,
/// each thread provides two \p int value.
///
/// \code{.cpp}
///
/// struct my_block_prefix
/// {
/// int prefix;
///
/// __device__ my_block_prefix(int prefix) : prefix(prefix) {}
///
/// __device__ int operator()(int block_reduction)
/// {
/// int old_prefix = prefix;
/// prefix = prefix + block_reduction;
/// return old_prefix;
/// }
/// };
///
/// __global__ void example_kernel(...) // blockDim.x = 128
/// {
/// // specialize block_scan for int and block of 128 threads
/// using block_scan_int = rocprim::block_scan<int, 128>;
/// // allocate storage in shared memory for the block
/// __shared__ block_scan_int::storage_type storage;
///
/// // init prefix functor
/// my_block_prefix prefix_callback(10);
///
/// int input[2] = ...;
/// int output[2];
/// // execute exclusive prefix sum
/// block_scan_int().exclusive_scan(
/// input,
/// output,
/// storage,
/// prefix_callback,
/// rocprim::plus<int>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{1, 1, 1, ..., 1}</tt>, then
/// \p output values in will be <tt>{10, 11, 12, 13, ..., 265}</tt>, and the \p prefix will
/// be <tt>266</tt>.
/// \endparblock
template<unsigned int ItemsPerThread, class PrefixCallback, class BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE
void exclusive_scan(T (&input)[ItemsPerThread],
                    T (&output)[ItemsPerThread],
                    storage_type&    storage,
                    PrefixCallback&  prefix_callback_op,
                    BinaryFunction   scan_op)
{
    // Arrays holding more than one item are passed whole to base_type, together
    // with the caller's prefix callback object.
    if(ItemsPerThread != 1)
    {
        base_type::exclusive_scan(input, output, storage, prefix_callback_op, scan_op);
        return;
    }

    // A one-element array is routed through the single-value overload.
    base_type::exclusive_scan(input[0], output[0], storage, prefix_callback_op, scan_op);
}
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_SCAN_HPP_
3rdparty/cub/rocprim/block/block_shuffle.hpp
0 → 100644
View file @
f8a481f8
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
#define ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "detail/block_reduce_warp_reduce.hpp"
#include "detail/block_reduce_raking_reduce.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief The block_shuffle class is a block level parallel primitive which provides methods
/// for shuffling data partitioned across a block
///
/// \tparam T - the input/output type.
/// \tparam BlockSizeX - the number of threads in a block's x dimension, it has no defaults value.
/// \tparam BlockSizeY - the number of threads in a block's y dimension, defaults to 1.
/// \tparam BlockSizeZ - the number of threads in a block's z dimension, defaults to 1.
///
/// \par Overview
/// It is commonplace for blocks of threads to rearrange data items between
/// threads. The BlockShuffle abstraction allows threads to efficiently shift items
/// either (a) up to their successor or (b) down to their predecessor.
/// * Computation can be more efficient when:
/// * \p ItemsPerThread is greater than one,
/// * \p T is an arithmetic type,
/// * the number of threads in the block is a multiple of the hardware warp size (see rocprim::warp_size()).
///
/// \par Examples
/// \parblock
/// In the examples shuffle operation is performed on block of 192 threads, each provides
/// one \p int value, result is returned using the same variable as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_shuffle for int and a block of 192 threads
/// using block__shuffle_int = rocprim::block_shuffle<int, 192>;
/// // allocate storage in shared memory
/// __shared__ block__shuffle_int::storage_type storage;
///
/// int value = ...;
/// // execute block shuffle
/// block__shuffle_int().inclusive_up(
/// value, // input
/// value, // output
/// storage
/// );
/// ...
/// }
/// \endcode
/// \endparblock
template
<
class
T
,
unsigned
int
BlockSizeX
,
unsigned
int
BlockSizeY
=
1
,
unsigned
int
BlockSizeZ
=
1
>
class
block_shuffle
{
// Product of the three block dimensions given as template arguments.
static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
// Layout of this primitive's temporary storage; it is wrapped into a
// raw_storage object to form the public storage_type.
struct storage_type_
{
    // One slot per thread, indexed by flat thread id.
    T prev[BlockSize];
    // Second per-thread array; NOTE(review): presumably used by the down()
    // implementation - confirm against the full class.
    T next[BlockSize];
};
public:
/// \brief Struct used to allocate a temporary memory that is required for thread
/// communication during operations provided by related parallel primitive.
///
/// Depending on the implementation the operations exposed by parallel primitive may
/// require a temporary storage for thread communication. The storage should be allocated
/// using keywords <tt>__shared__</tt>. It can be aliased to
/// an externally allocated memory, or be a part of a union type with other storage types
/// to increase shared memory reusability.
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
using storage_type = detail::raw_storage<storage_type_>;
#else
using storage_type = storage_type_; // only for Doxygen
#endif
/// \brief Shuffles data across threads in a block, offset by the distance value.
///
/// \par A thread with threadId i receives data from the thread with threadId (i + distance), where distance may be a negative value.
/// Required shared memory is allocated by the method itself.
/// \par A thread whose source threadId is invalid, i.e. (i + distance) < 0 || (i + distance) >= BlockSize, does not update its output.
///
/// \param [in] input - input data to be shuffled to another thread.
/// \param [out] output - reference to an output value that receives data from another thread.
/// \param [in] distance - offset added to the calling thread's threadId to select the thread whose input is read.
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block__shuffle_int for int and logical warp of 192 threads
/// using block__shuffle_int = rocprim::block_shuffle<int, 192>;
///
/// int value = ...;
/// // execute block shuffle
/// block__shuffle_int().offset(
/// value, // input
/// value // output
/// );
/// ...
/// }
/// \endcode
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void offset(T input, T& output, int distance = 1)
{
    // Look up the calling thread's flat id for this block shape, then delegate
    // to the overload that takes the id explicitly.
    const auto flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    offset(flat_id, input, output, distance);
}
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void offset(const size_t& flat_id, T input, T& output, int distance)
{
    // Storage is declared here rather than supplied by the caller, then the
    // work is done by the overload that takes a storage reference.
    ROCPRIM_SHARED_MEMORY storage_type storage;
    offset(flat_id, input, output, distance, storage);
}
ROCPRIM_DEVICE ROCPRIM_INLINE
void offset(const size_t& flat_id,
            T             input,
            T&            output,
            int           distance,
            storage_type& storage)
{
    // Every thread stores its value in its own slot, then all threads meet at
    // the barrier before any slot is read.
    storage_type_& shared = storage.get();
    shared.prev[flat_id]  = input;
    ::rocprim::syncthreads();

    // Slot this thread reads from; may fall outside the block for threads near
    // either end, since distance can be negative.
    const int  source_id       = static_cast<int>(flat_id) + distance;
    const bool source_in_block = (source_id >= 0) && (source_id < static_cast<int>(BlockSize));
    if(!source_in_block)
    {
        // No valid source thread: output is left untouched.
        return;
    }

    output = shared.prev[static_cast<size_t>(source_id)];
}
/// \brief Shuffles data across threads in a block, offset by the distance value and wrapping around the block.
///
/// \par A thread with threadId i receives data from the thread with threadId (i + distance) % BlockSize.
/// Required shared memory is allocated by the method itself.
/// \par Data is rotated around the block, using (threadId + distance) modulo BlockSize to ensure valid threadIds.
///
/// \param [in] input - input data to be shuffled to another thread.
/// \param [out] output - reference to an output value that receives data from another thread.
/// \param [in] distance - offset added to the calling thread's threadId (modulo BlockSize) to select the thread whose input is read.
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block__shuffle_int for int and logical warp of 192 threads
/// using block__shuffle_int = rocprim::block_shuffle<int, 192>;
///
/// int value = ...;
/// // execute block shuffle
/// block__shuffle_int().rotate(
/// value, // input
/// value // output
/// );
/// ...
/// }
/// \endcode
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void rotate(T input, T& output, unsigned int distance = 1)
{
    // Look up the calling thread's flat id for this block shape, then delegate
    // to the overload that takes the id explicitly.
    const auto flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    rotate(flat_id, input, output, distance);
}
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void rotate(const size_t& flat_id, T input, T& output, unsigned int distance)
{
    // Storage is declared here rather than supplied by the caller, then the
    // work is done by the overload that takes a storage reference.
    ROCPRIM_SHARED_MEMORY storage_type storage;
    rotate(flat_id, input, output, distance, storage);
}
/// Rotating shuffle using caller-provided storage.
///
/// \param [in] flat_id - flat id of the calling thread; used both as the slot
/// written by this thread and as the base of the slot it reads.
/// \param [in] input - value published by the calling thread.
/// \param [out] output - receives the value published by the thread with
/// flat id <tt>(flat_id + distance) % BlockSize</tt>.
/// \param [in] distance - rotation distance; any value is accepted and is
/// reduced modulo BlockSize.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
ROCPRIM_DEVICE ROCPRIM_INLINE
void rotate(const size_t& flat_id,
            T             input,
            T&            output,
            unsigned int  distance,
            storage_type& storage)
{
    // Every thread stores its value in its own slot, then all threads meet at
    // the barrier before any slot is read.
    storage_type_& storage_ = storage.get();
    storage_.prev[flat_id]  = input;
    ::rocprim::syncthreads();

    // The source slot is derived from flat_id - the same index used for the
    // write above - so the result is consistent for 2D/3D blocks and for
    // callers that pass their own id. The modulo keeps the index in range even
    // when distance >= BlockSize; the sum is formed in size_t, so it does not
    // wrap for unsigned int distances.
    const size_t source_id = (flat_id + distance) % BlockSize;
    output                 = storage_.prev[source_id];
}
/// \brief The thread block rotates a blocked arrange of input items,
/// shifting it up by one item
///
/// \param [in] input - The calling thread's input items
/// \param [out] prev - The corresponding predecessor items (may be aliased to \p input).
/// The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block__shuffle_int for int and logical warp of 192 threads
/// using block__shuffle_int = rocprim::block_shuffle<int, 192>;
///
/// int value = ...;
/// // execute block shuffle
/// block__shuffle_int().up(
/// value, // input
/// value // output
/// );
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void up(T (&input)[ItemsPerThread], T (&prev)[ItemsPerThread])
{
    // Look up the calling thread's flat id for this block shape, then delegate
    // to the overload that takes the id explicitly.
    const auto flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    this->up(flat_id, input, prev);
}
template<unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void up(const size_t& flat_id, T (&input)[ItemsPerThread], T (&prev)[ItemsPerThread])
{
    // Storage is declared here rather than supplied by the caller, then the
    // work is done by the overload that takes a storage reference.
    ROCPRIM_SHARED_MEMORY storage_type storage;
    this->up(flat_id, input, prev, storage);
}
template<unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_INLINE
void up(const size_t& flat_id,
        T (&input)[ItemsPerThread],
        T (&prev)[ItemsPerThread],
        storage_type& storage)
{
    // Each thread stores its last item before anything is written to prev,
    // so the value is captured even when prev aliases input.
    storage_type_& shared = storage.get();
    shared.prev[flat_id]  = input[ItemsPerThread - 1];
    ::rocprim::syncthreads();

    // Shift the thread's own items up by one, walking from the highest index
    // down to index 1 so each source element is read before it is overwritten.
    ROCPRIM_UNROLL
    for(unsigned int step = 1; step < ItemsPerThread; ++step)
    {
        const unsigned int dst = ItemsPerThread - step;
        prev[dst]              = input[dst - 1];
    }

    // Thread 0 has no predecessor and leaves prev[0] untouched; every other
    // thread takes the last item of the thread before it.
    if(flat_id == 0)
    {
        return;
    }
    prev[0] = shared.prev[flat_id - 1];
}
/// \brief The thread block rotates a blocked arrange of input items,
/// shifting it up by one item
///
/// \param [in] input - The calling thread's input items
/// \param [out] prev - The corresponding predecessor items (may be aliased to \p input).
/// The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
/// \param [out] block_suffix - The item \p input[ItemsPerThread-1] from
/// <em>thread</em><sub><tt>BlockSize-1</tt></sub>, provided to all threads
template<unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void up(T (&input)[ItemsPerThread], T (&prev)[ItemsPerThread], T& block_suffix)
{
    // Look up the calling thread's flat id for this block shape, then delegate
    // to the overload that takes the id explicitly.
    const auto flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    this->up(flat_id, input, prev, block_suffix);
}
/// \brief Overload of up() with block suffix for a caller-supplied flat thread id.
///
/// Declares the temporary storage itself (with \p ROCPRIM_SHARED_MEMORY) and
/// forwards to the overload that takes a storage reference.
///
/// \param [in] flat_id - flat id of the calling thread, used to index the storage.
/// \param [in] input - the calling thread's input items.
/// \param [out] prev - receives the corresponding predecessor items.
/// \param [out] block_suffix - receives the block-wide suffix item.
template<unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void up(const size_t& flat_id,
        T (&input)[ItemsPerThread],
        T (&prev)[ItemsPerThread],
        T& block_suffix)
{
    ROCPRIM_SHARED_MEMORY storage_type scratch;
    this->up(flat_id, input, prev, block_suffix, scratch);
}
/// \brief Overload of up() with block suffix that works on caller-provided
/// temporary storage.
///
/// \param [in] flat_id - flat id of the calling thread, used to index the storage.
/// \param [in] input - the calling thread's input items.
/// \param [out] prev - receives the corresponding predecessor items.
/// \param [out] block_suffix - receives slot <tt>BlockSize - 1</tt> of the storage,
/// i.e. the value the storage overload of up() stored for the thread whose
/// flat id is <tt>BlockSize - 1</tt> (its \p input[ItemsPerThread-1]).
/// \param [in] storage - reference to a temporary storage object of type storage_type.
template<unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_INLINE
void up(const size_t& flat_id,
        T (&input)[ItemsPerThread],
        T (&prev)[ItemsPerThread],
        T& block_suffix,
        storage_type& storage)
{
    this->up(flat_id, input, prev, storage);

    // Update block suffix. The call above has already passed its barrier, so
    // every slot of the storage is populated. The storage is a reference and
    // its contents are reached through get(), as in the other overloads.
    block_suffix = storage.get().prev[BlockSize - 1];
}
/// \brief The thread block rotates a blocked arrangement of input items,
/// shifting it down by one item.
///
/// \param [in] input - The calling thread's input items
/// \param [out] next - The corresponding successor items (may be aliased to \p input).
/// The item \p next[ItemsPerThread-1] is not updated for <em>thread</em><sub>BlockSize - 1</sub>.
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block__shuffle_int for int and a block of 192 threads
/// using block__shuffle_int = rocprim::block_shuffle<int, 192>;
///
/// int value = ...;
/// // execute block shuffle
/// block__shuffle_int().down(
/// value, // input
/// value // output
/// );
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void down(T (&input)[ItemsPerThread],
          T (&next)[ItemsPerThread])
{
    // Convenience overload: obtain the calling thread's flat id from
    // flat_block_thread_id and hand over to the overload taking a flat id.
    const size_t flat_id
        = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    this->down(flat_id, input, next);
}
/// \brief Overload of down() for a caller-supplied flat thread id.
///
/// Declares the temporary storage itself (with \p ROCPRIM_SHARED_MEMORY) and
/// forwards to the overload that takes a storage reference.
///
/// \param [in] flat_id - flat id of the calling thread, used to index the storage.
/// \param [in] input - the calling thread's input items.
/// \param [out] next - receives the corresponding successor items.
template<unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void down(const size_t& flat_id,
          T (&input)[ItemsPerThread],
          T (&next)[ItemsPerThread])
{
    ROCPRIM_SHARED_MEMORY storage_type scratch;
    this->down(flat_id, input, next, scratch);
}
/// \brief Overload of down() that works on caller-provided temporary storage.
///
/// \param [in] flat_id - flat id of the calling thread, used to index the storage.
/// \param [in] input - the calling thread's input items.
/// \param [out] next - receives the corresponding successor items.
/// \p next[ItemsPerThread-1] is left unchanged when \p flat_id is
/// <tt>BlockSize - 1</tt>.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
template<unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_INLINE
void down(const size_t& flat_id,
          T (&input)[ItemsPerThread],
          T (&next)[ItemsPerThread],
          storage_type& storage)
{
    storage_type_& shared = storage.get();

    // Publish this thread's first item before it can be overwritten below
    // (next is allowed to alias input), then wait for every thread to do so.
    shared.next[flat_id] = input[0];
    ::rocprim::syncthreads();

    // Shift the thread-local items by one, walking from the front so each
    // source element is read before its slot is written.
    ROCPRIM_UNROLL
    for(unsigned int src = 1; src < ItemsPerThread; ++src)
    {
        next[src - 1] = input[src];
    }

    // The last slot comes from the following thread; the thread with flat id
    // BlockSize - 1 has no successor and keeps whatever that slot held.
    if(flat_id < (BlockSize - 1))
    {
        next[ItemsPerThread - 1] = shared.next[flat_id + 1];
    }
}
/// \brief The thread block rotates a blocked arrangement of input items,
/// shifting it down by one item.
///
/// \param [in] input - The calling thread's input items
/// \param [out] next - The corresponding successor items (may be aliased to \p input).
/// The item \p next[ItemsPerThread-1] is not updated for <em>thread</em><sub>BlockSize - 1</sub>.
/// \param [out] block_prefix - The item \p input[0] from <em>thread</em><sub><tt>0</tt></sub>, provided to all threads
template<unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void down(T (&input)[ItemsPerThread],
          T (&next)[ItemsPerThread],
          T& block_prefix)
{
    // Convenience overload: obtain the calling thread's flat id from
    // flat_block_thread_id and hand over to the overload taking a flat id.
    const size_t flat_id
        = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    this->down(flat_id, input, next, block_prefix);
}
/// \brief Overload of down() with block prefix for a caller-supplied flat thread id.
///
/// Declares the temporary storage itself (with \p ROCPRIM_SHARED_MEMORY) and
/// forwards to the overload that takes a storage reference.
///
/// \param [in] flat_id - flat id of the calling thread, used to index the storage.
/// \param [in] input - the calling thread's input items.
/// \param [out] next - receives the corresponding successor items.
/// \param [out] block_prefix - receives the block-wide prefix item.
template<unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void down(const size_t& flat_id,
          T (&input)[ItemsPerThread],
          T (&next)[ItemsPerThread],
          T& block_prefix)
{
    ROCPRIM_SHARED_MEMORY storage_type scratch;
    this->down(flat_id, input, next, block_prefix, scratch);
}
/// \brief Overload of down() with block prefix that works on caller-provided
/// temporary storage.
///
/// \param [in] flat_id - flat id of the calling thread, used to index the storage.
/// \param [in] input - the calling thread's input items.
/// \param [out] next - receives the corresponding successor items.
/// \param [out] block_prefix - receives slot 0 of the storage, i.e. the value the
/// storage overload of down() stored for the thread whose flat id is 0
/// (its \p input[0]).
/// \param [in] storage - reference to a temporary storage object of type storage_type.
template<unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_INLINE
void down(const size_t& flat_id,
          T (&input)[ItemsPerThread],
          T (&next)[ItemsPerThread],
          T& block_prefix,
          storage_type& storage)
{
    this->down(flat_id, input, next, storage);

    // Update block prefix. The call above has already passed its barrier, so
    // every slot of the storage is populated. The storage is a reference and
    // its contents are reached through get(), as in the other overloads.
    block_prefix = storage.get().next[0];
}
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
3rdparty/cub/rocprim/block/block_sort.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2020 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_SORT_HPP_
#define ROCPRIM_BLOCK_BLOCK_SORT_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "detail/block_sort_bitonic.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Available algorithms for block_sort primitive.
enum class block_sort_algorithm
{
    /// \brief Sorting implemented on top of a bitonic sort.
    bitonic_sort,

    /// \brief Algorithm used when none is requested explicitly
    /// (an alias of \p bitonic_sort).
    default_algorithm = bitonic_sort
};
namespace detail
{

// Compile-time mapping from a block_sort_algorithm enumerator to the class
// template that implements it. The primary template is only declared; each
// supported algorithm supplies a specialization exposing the alias template
// `type`.
template<block_sort_algorithm Algorithm>
struct select_block_sort_impl;

// bitonic_sort is implemented by block_sort_bitonic.
template<>
struct select_block_sort_impl<block_sort_algorithm::bitonic_sort>
{
    template<class Key,
             unsigned int BlockSizeX,
             unsigned int BlockSizeY,
             unsigned int BlockSizeZ,
             unsigned int ItemsPerThread,
             class Value>
    using type = block_sort_bitonic<Key,
                                    BlockSizeX,
                                    BlockSizeY,
                                    BlockSizeZ,
                                    ItemsPerThread,
                                    Value>;
};

} // end namespace detail
/// \brief The block_sort class is a block level parallel primitive which provides
/// methods for sorting items (keys or key-value pairs) partitioned across threads in a block
/// using a comparison-based sort algorithm.
///
/// \tparam Key - the key type.
/// \tparam BlockSizeX - the number of threads in a block's x dimension.
/// \tparam ItemsPerThread - number of items processed by each thread.
/// The total range will be BlockSize * ItemsPerThread long
/// \tparam Value - the value type. Default type empty_type indicates
/// a keys-only sort.
/// \tparam Algorithm - selected sort algorithm, block_sort_algorithm::default_algorithm by default.
/// \tparam BlockSizeY - the number of threads in a block's y dimension, defaults to 1.
/// \tparam BlockSizeZ - the number of threads in a block's z dimension, defaults to 1.
///
/// \par Overview
/// * Accepts custom compare_functions for sorting across a block.
/// * Performance depends on \p BlockSize.
/// * It is better if \p BlockSize is a power of two.
/// * If \p BlockSize is not a power of two, or when function with \p size overload is used
/// odd-even sort is used instead of bitonic sort, leading to decreased performance.
///
/// \par Examples
/// \parblock
/// In the examples sort is performed on a block of 256 threads, each thread provides
/// one \p int value, results are returned using the same variable as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_sort for int, block of 256 threads,
/// // key-only sort
/// using block_sort_int = rocprim::block_sort<int, 256>;
/// // allocate storage in shared memory
/// __shared__ block_sort_int::storage_type storage;
///
/// int input = ...;
/// // execute block sort (ascending)
/// block_sort_int().sort(
/// input,
/// storage
/// );
/// ...
/// }
/// \endcode
/// \endparblock
template<class Key,
         unsigned int         BlockSizeX,
         unsigned int         ItemsPerThread = 1,
         class Value                         = empty_type,
         block_sort_algorithm Algorithm      = block_sort_algorithm::default_algorithm,
         unsigned int         BlockSizeY     = 1,
         unsigned int         BlockSizeZ     = 1>
class block_sort
#ifndef DOXYGEN_SHOULD_SKIP_THIS
    : private detail::select_block_sort_impl<Algorithm>::
          template type<Key, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Value>
#endif
{
    // Implementation chosen by Algorithm; every sort() overload below simply
    // forwards its arguments to the matching overload of this type.
    using base_type = typename detail::select_block_sort_impl<Algorithm>::
        template type<Key, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Value>;

public:
    /// \brief Type of the temporary storage needed for thread communication
    /// during the operations of this primitive.
    ///
    /// Depending on the implementation, the operations exposed by a parallel
    /// primitive may need temporary storage for thread communication. Allocate it
    /// with the <tt>__shared__</tt> keyword. It may alias externally allocated
    /// memory or be a member of a union with other storage types to increase
    /// shared memory reusability.
    using storage_type = typename base_type::storage_type;

    /// \brief Block sort of one key per thread.
    ///
    /// \tparam BinaryFunction - type of binary function used for sort. Default type
    /// is rocprim::less<Key>.
    ///
    /// \param [in, out] thread_key - reference to a key provided by a thread.
    /// \param [in] compare_function - comparison function object which returns true if the
    /// first argument is ordered before the second.
    /// The signature of the function should be equivalent to the following:
    /// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
    template<class BinaryFunction = ::rocprim::less<Key>>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void sort(Key& thread_key,
              BinaryFunction compare_function = BinaryFunction())
    {
        base_type::sort(thread_key, compare_function);
    }

    /// \brief Block sort of \p ItemsPerThread keys per thread.
    ///
    /// \tparam BinaryFunction - type of binary function used for sort. Default type
    /// is rocprim::less<Key>.
    ///
    /// \param [in, out] thread_keys - reference to the keys provided by a thread.
    /// \param [in] compare_function - comparison function object, see the
    /// single-key overload for its requirements.
    template<class BinaryFunction = ::rocprim::less<Key>>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void sort(Key (&thread_keys)[ItemsPerThread],
              BinaryFunction compare_function = BinaryFunction())
    {
        base_type::sort(thread_keys, compare_function);
    }

    /// \brief Block sort of one key per thread, using caller-provided storage.
    ///
    /// \tparam BinaryFunction - type of binary function used for sort. Default type
    /// is rocprim::less<Key>.
    ///
    /// \param [in, out] thread_key - reference to a key provided by a thread.
    /// \param [in] storage - reference to a temporary storage object of type storage_type.
    /// \param [in] compare_function - comparison function object which returns true if the
    /// first argument is ordered before the second.
    /// The signature of the function should be equivalent to the following:
    /// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
    ///
    /// \par Storage reusage
    /// Synchronization barrier should be placed before \p storage is reused
    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
    ///
    /// \par Examples
    /// \parblock
    /// In the example the sort is performed on a block of 256 threads, each thread provides
    /// one \p int value, results are returned using the same variable as for input.
    ///
    /// \code{.cpp}
    /// __global__ void example_kernel(...)
    /// {
    ///     // specialize block_sort for int, block of 256 threads,
    ///     // key-only sort
    ///     using block_sort_int = rocprim::block_sort<int, 256>;
    ///     // allocate storage in shared memory
    ///     __shared__ block_sort_int::storage_type storage;
    ///
    ///     int input = ...;
    ///     // execute block sort (ascending)
    ///     block_sort_int().sort(
    ///         input,
    ///         storage
    ///     );
    ///     ...
    /// }
    /// \endcode
    /// \endparblock
    template<class BinaryFunction = ::rocprim::less<Key>>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void sort(Key& thread_key,
              storage_type& storage,
              BinaryFunction compare_function = BinaryFunction())
    {
        base_type::sort(thread_key, storage, compare_function);
    }

    /// \brief Block sort of \p ItemsPerThread keys per thread, using
    /// caller-provided storage.
    ///
    /// \tparam BinaryFunction - type of binary function used for sort. Default type
    /// is rocprim::less<Key>.
    ///
    /// \param [in, out] thread_keys - reference to the keys provided by a thread.
    /// \param [in] storage - reference to a temporary storage object of type storage_type.
    /// \param [in] compare_function - comparison function object, see the
    /// single-key overload for its requirements.
    template<class BinaryFunction = ::rocprim::less<Key>>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void sort(Key (&thread_keys)[ItemsPerThread],
              storage_type& storage,
              BinaryFunction compare_function = BinaryFunction())
    {
        base_type::sort(thread_keys, storage, compare_function);
    }

    /// \brief Block sort by key of one key-value pair per thread.
    ///
    /// \tparam BinaryFunction - type of binary function used for sort. Default type
    /// is rocprim::less<Key>.
    ///
    /// \param [in, out] thread_key - reference to a key provided by a thread.
    /// \param [in, out] thread_value - reference to a value provided by a thread.
    /// \param [in] compare_function - comparison function object which returns true if the
    /// first argument is ordered before the second.
    /// The signature of the function should be equivalent to the following:
    /// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
    template<class BinaryFunction = ::rocprim::less<Key>>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void sort(Key& thread_key,
              Value& thread_value,
              BinaryFunction compare_function = BinaryFunction())
    {
        base_type::sort(thread_key, thread_value, compare_function);
    }

    /// \brief Block sort by key of \p ItemsPerThread key-value pairs per thread.
    ///
    /// \tparam BinaryFunction - type of binary function used for sort. Default type
    /// is rocprim::less<Key>.
    ///
    /// \param [in, out] thread_keys - reference to the keys provided by a thread.
    /// \param [in, out] thread_values - reference to the values provided by a thread.
    /// \param [in] compare_function - comparison function object, see the
    /// single-pair overload for its requirements.
    template<class BinaryFunction = ::rocprim::less<Key>>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void sort(Key (&thread_keys)[ItemsPerThread],
              Value (&thread_values)[ItemsPerThread],
              BinaryFunction compare_function = BinaryFunction())
    {
        base_type::sort(thread_keys, thread_values, compare_function);
    }

    /// \brief Block sort by key of one key-value pair per thread, using
    /// caller-provided storage.
    ///
    /// \tparam BinaryFunction - type of binary function used for sort. Default type
    /// is rocprim::less<Key>.
    ///
    /// \param [in, out] thread_key - reference to a key provided by a thread.
    /// \param [in, out] thread_value - reference to a value provided by a thread.
    /// \param [in] storage - reference to a temporary storage object of type storage_type.
    /// \param [in] compare_function - comparison function object which returns true if the
    /// first argument is ordered before the second.
    /// The signature of the function should be equivalent to the following:
    /// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
    ///
    /// \par Storage reusage
    /// Synchronization barrier should be placed before \p storage is reused
    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
    ///
    /// \par Examples
    /// \parblock
    /// In the example the sort is performed on a block of 256 threads, each thread provides
    /// one \p int key and one \p int value, results are returned using the same variables
    /// as for input.
    ///
    /// \code{.cpp}
    /// __global__ void example_kernel(...)
    /// {
    ///     // specialize block_sort for int keys, block of 256 threads,
    ///     // one item per thread and int values
    ///     using block_sort_int = rocprim::block_sort<int, 256, 1, int>;
    ///     // allocate storage in shared memory
    ///     __shared__ block_sort_int::storage_type storage;
    ///
    ///     int key = ...;
    ///     int value = ...;
    ///     // execute block sort (ascending)
    ///     block_sort_int().sort(
    ///         key,
    ///         value,
    ///         storage
    ///     );
    ///     ...
    /// }
    /// \endcode
    /// \endparblock
    template<class BinaryFunction = ::rocprim::less<Key>>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void sort(Key& thread_key,
              Value& thread_value,
              storage_type& storage,
              BinaryFunction compare_function = BinaryFunction())
    {
        base_type::sort(thread_key, thread_value, storage, compare_function);
    }

    /// \brief Block sort by key of \p ItemsPerThread key-value pairs per thread,
    /// using caller-provided storage.
    ///
    /// \tparam BinaryFunction - type of binary function used for sort. Default type
    /// is rocprim::less<Key>.
    ///
    /// \param [in, out] thread_keys - reference to the keys provided by a thread.
    /// \param [in, out] thread_values - reference to the values provided by a thread.
    /// \param [in] storage - reference to a temporary storage object of type storage_type.
    /// \param [in] compare_function - comparison function object, see the
    /// single-pair overload for its requirements.
    template<class BinaryFunction = ::rocprim::less<Key>>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void sort(Key (&thread_keys)[ItemsPerThread],
              Value (&thread_values)[ItemsPerThread],
              storage_type& storage,
              BinaryFunction compare_function = BinaryFunction())
    {
        base_type::sort(thread_keys, thread_values, storage, compare_function);
    }

    /// \brief Block sort of one key per thread over a custom number of items.
    /// If \p size is greater than \p BlockSize, this function does nothing.
    ///
    /// \tparam BinaryFunction - type of binary function used for sort. Default type
    /// is rocprim::less<Key>.
    ///
    /// \param [in, out] thread_key - reference to a key provided by a thread.
    /// \param [in] storage - reference to a temporary storage object of type storage_type.
    /// \param [in] size - custom size of block to be sorted.
    /// \param [in] compare_function - comparison function object which returns true if the
    /// first argument is ordered before the second.
    /// The signature of the function should be equivalent to the following:
    /// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
    template<class BinaryFunction = ::rocprim::less<Key>>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void sort(Key& thread_key,
              storage_type& storage,
              const unsigned int size,
              BinaryFunction compare_function = BinaryFunction())
    {
        base_type::sort(thread_key, storage, size, compare_function);
    }

    /// \brief Block sort by key of one key-value pair per thread over a custom
    /// number of items. If \p size is greater than \p BlockSize, this function
    /// does nothing.
    ///
    /// \tparam BinaryFunction - type of binary function used for sort. Default type
    /// is rocprim::less<Key>.
    ///
    /// \param [in, out] thread_key - reference to a key provided by a thread.
    /// \param [in, out] thread_value - reference to a value provided by a thread.
    /// \param [in] storage - reference to a temporary storage object of type storage_type.
    /// \param [in] size - custom size of block to be sorted.
    /// \param [in] compare_function - comparison function object which returns true if the
    /// first argument is ordered before the second.
    /// The signature of the function should be equivalent to the following:
    /// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
    template<class BinaryFunction = ::rocprim::less<Key>>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void sort(Key& thread_key,
              Value& thread_value,
              storage_type& storage,
              const unsigned int size,
              BinaryFunction compare_function = BinaryFunction())
    {
        base_type::sort(thread_key, thread_value, storage, size, compare_function);
    }
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_SORT_HPP_
3rdparty/cub/rocprim/block/block_store.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_STORE_HPP_
#define ROCPRIM_BLOCK_BLOCK_STORE_HPP_
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "../types.hpp"
#include "block_store_func.hpp"
#include "block_exchange.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief \p block_store_method enumerates the methods available to store a striped arrangement
/// of items into a blocked/striped arrangement on continuous memory
enum class block_store_method
{
    /// A blocked arrangement of items is stored into a blocked arrangement on
    /// continuous memory.
    /// \par Performance Notes:
    /// * Performance decreases as the number of items per thread grows (the stride
    /// between accesses increases), because memory coalescing is reduced.
    block_store_direct,

    /// A striped arrangement of items is stored into a blocked arrangement on
    /// continuous memory.
    block_store_striped,

    /// A blocked arrangement of items is stored into a blocked arrangement on
    /// continuous memory, using vectorization as an optimization.
    /// \par Performance Notes:
    /// * Performance remains high due to increased memory coalescing, provided that
    /// the vectorization requirements are fulfilled. Otherwise, performance falls
    /// back to that of \p block_store_direct.
    /// \par Requirements:
    /// * The output offset (\p block_output) must be quad-item aligned.
    /// * The following conditions prevent vectorization and switch to the default
    /// \p block_store_direct:
    /// * \p ItemsPerThread is odd.
    /// * The datatype \p T is not a primitive or a HIP vector type (e.g. int2,
    /// int4, etc.).
    block_store_vectorize,

    /// A blocked arrangement of items is locally transposed and stored as a striped
    /// arrangement of data on continuous memory.
    /// \par Performance Notes:
    /// * Performance remains high due to increased memory coalescing, regardless of
    /// the number of items per thread.
    /// * Performance may be better compared to \p block_store_direct and
    /// \p block_store_vectorize due to reordering on local memory.
    block_store_transpose,

    /// A blocked arrangement of items is locally transposed and stored as a
    /// warp-striped arrangement of data on continuous memory.
    /// \par Requirements:
    /// * The number of threads in the block must be a multiple of the size of the
    /// hardware warp.
    /// \par Performance Notes:
    /// * Performance remains high due to increased memory coalescing, regardless of
    /// the number of items per thread.
    /// * Performance may be better compared to \p block_store_direct and
    /// \p block_store_vectorize due to reordering on local memory.
    block_store_warp_transpose,

    /// Defaults to \p block_store_direct
    default_method = block_store_direct
};
/// \brief The \p block_store class is a block level parallel primitive which provides methods
/// for storing an arrangement of items into a blocked/striped arrangement on continuous memory.
///
/// \tparam T - the type of the items to store.
/// \tparam BlockSize - the number of threads in a block.
/// \tparam ItemsPerThread - the number of items to be processed by
/// each thread.
/// \tparam Method - the method to store data.
///
/// \par Overview
/// * The \p block_store class has a number of different methods to store data:
/// * [block_store_direct](\ref ::block_store_method::block_store_direct)
/// * [block_store_striped](\ref ::block_store_method::block_store_striped)
/// * [block_store_vectorize](\ref ::block_store_method::block_store_vectorize)
/// * [block_store_transpose](\ref ::block_store_method::block_store_transpose)
/// * [block_store_warp_transpose](\ref ::block_store_method::block_store_warp_transpose)
///
/// \par Example:
/// \parblock
/// In the examples store operation is performed on block of 128 threads, using type
/// \p int and 8 items per thread.
///
/// \code{.cpp}
/// __global__ void kernel(int * output)
/// {
/// const int offset = blockIdx.x * 128 * 8;
/// int items[8];
/// rocprim::block_store<int, 128, 8, store_method> blockstore;
/// blockstore.store(output + offset, items);
/// ...
/// }
/// \endcode
/// \endparblock
template<class T,
         unsigned int       BlockSizeX,
         unsigned int       ItemsPerThread,
         block_store_method Method     = block_store_method::block_store_direct,
         unsigned int       BlockSizeY = 1,
         unsigned int       BlockSizeZ = 1>
class block_store
{
private:
    using storage_type_ = typename ::rocprim::detail::empty_storage_type;

public:
    /// \brief Type of the temporary storage needed for thread communication
    /// during the operations of this primitive.
    ///
    /// Depending on the implementation, the operations exposed by a parallel
    /// primitive may need temporary storage for thread communication. Allocate it
    /// with the \p __shared__ keyword. It may alias externally allocated memory
    /// or be a member of a union with other storage types to increase shared
    /// memory reusability.
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
    using storage_type = typename ::rocprim::detail::empty_storage_type;
#else
    using storage_type = storage_type_; // only for Doxygen
#endif

    /// \brief Stores an arrangement of items from across the thread block into an
    /// arrangement on continuous memory.
    ///
    /// Obtains the calling thread's flat id and forwards to
    /// \p block_store_direct_blocked.
    ///
    /// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
    /// pointer).
    ///
    /// \param [out] block_output - the output iterator from the thread block to store to.
    /// \param [in] items - array that data is read from.
    ///
    /// \par Overview
    /// * The type \p T must be such that it can be implicitly converted and assigned
    /// to a dereferenced object of type \p OutputIterator.
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread])
    {
        const unsigned int thread_id
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_store_direct_blocked(thread_id, block_output, items);
    }

    /// \brief Stores an arrangement of items from across the thread block into an
    /// arrangement on continuous memory, which is guarded by range \p valid.
    ///
    /// Obtains the calling thread's flat id and forwards to the \p valid overload
    /// of \p block_store_direct_blocked.
    ///
    /// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
    /// pointer).
    ///
    /// \param [out] block_output - the output iterator from the thread block to store to.
    /// \param [in] items - array that data is read from.
    /// \param [in] valid - maximum range of valid numbers to store.
    ///
    /// \par Overview
    /// * The type \p T must be such that it can be implicitly converted and assigned
    /// to a dereferenced object of type \p OutputIterator.
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               unsigned int valid)
    {
        const unsigned int thread_id
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_store_direct_blocked(thread_id, block_output, items, valid);
    }

    /// \brief Stores an arrangement of items from across the thread block into an
    /// arrangement on continuous memory, using temporary storage.
    ///
    /// In this (primary) template the storage is not used; the call is forwarded
    /// to the overload without a storage argument.
    ///
    /// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
    /// pointer).
    ///
    /// \param [out] block_output - the output iterator from the thread block to store to.
    /// \param [in] items - array that data is read from.
    /// \param [in] storage - temporary storage for outputs.
    ///
    /// \par Overview
    /// * The type \p T must be such that it can be implicitly converted and assigned
    /// to a dereferenced object of type \p OutputIterator.
    ///
    /// \par Storage reusage
    /// Synchronization barrier should be placed before \p storage is reused
    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
    ///
    /// \par Example.
    /// \code{.cpp}
    /// __global__ void kernel(...)
    /// {
    ///     int items[8];
    ///     using block_store_int = rocprim::block_store<int, 128, 8>;
    ///     block_store_int bstore;
    ///     __shared__ typename block_store_int::storage_type storage;
    ///     bstore.store(..., items, storage);
    ///     ...
    /// }
    /// \endcode
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               storage_type& storage)
    {
        static_cast<void>(storage); // unused in this implementation
        this->store(block_output, items);
    }

    /// \brief Stores an arrangement of items from across the thread block into an
    /// arrangement on continuous memory, which is guarded by range \p valid,
    /// using temporary storage.
    ///
    /// In this (primary) template the storage is not used; the call is forwarded
    /// to the \p valid overload without a storage argument.
    ///
    /// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
    /// pointer).
    ///
    /// \param [out] block_output - the output iterator from the thread block to store to.
    /// \param [in] items - array that data is read from.
    /// \param [in] valid - maximum range of valid numbers to store.
    /// \param [in] storage - temporary storage for outputs.
    ///
    /// \par Overview
    /// * The type \p T must be such that it can be implicitly converted and assigned
    /// to a dereferenced object of type \p OutputIterator.
    ///
    /// \par Storage reusage
    /// Synchronization barrier should be placed before \p storage is reused
    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
    ///
    /// \par Example.
    /// \code{.cpp}
    /// __global__ void kernel(...)
    /// {
    ///     int items[8];
    ///     using block_store_int = rocprim::block_store<int, 128, 8>;
    ///     block_store_int bstore;
    ///     __shared__ typename block_store_int::storage_type storage;
    ///     bstore.store(..., items, valid, storage);
    ///     ...
    /// }
    /// \endcode
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               unsigned int valid,
               storage_type& storage)
    {
        static_cast<void>(storage); // unused in this implementation
        this->store(block_output, items, valid);
    }
};
/// @}
// end of group blockmodule
#ifndef DOXYGEN_SHOULD_SKIP_THIS
// Specialization for block_store_method::block_store_striped: every store()
// overload ends up in block_store_direct_striped<BlockSize>.
template<class T,
         unsigned int BlockSizeX,
         unsigned int ItemsPerThread,
         unsigned int BlockSizeY,
         unsigned int BlockSizeZ>
class block_store<T,
                  BlockSizeX,
                  ItemsPerThread,
                  block_store_method::block_store_striped,
                  BlockSizeY,
                  BlockSizeZ>
{
    // Product of the three block dimensions, passed to block_store_direct_striped.
    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;

private:
    using storage_type_ = typename ::rocprim::detail::empty_storage_type;

public:
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
    using storage_type = typename ::rocprim::detail::empty_storage_type;
#else
    using storage_type = storage_type_; // only for Doxygen
#endif

    // Stores all items of the calling thread.
    template<class OutputIterator>
    ROCPRIM_DEVICE inline
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread])
    {
        const unsigned int thread_id
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_store_direct_striped<BlockSize>(thread_id, block_output, items);
    }

    // Stores the items of the calling thread, guarded by range valid.
    template<class OutputIterator>
    ROCPRIM_DEVICE inline
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               unsigned int valid)
    {
        const unsigned int thread_id
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_store_direct_striped<BlockSize>(thread_id, block_output, items, valid);
    }

    // Storage-taking variant; the storage is not used by this method, so the
    // work is done by the overload without a storage argument.
    template<class OutputIterator>
    ROCPRIM_DEVICE inline
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               storage_type& storage)
    {
        static_cast<void>(storage);
        this->store(block_output, items);
    }

    // Storage-taking, guarded variant; the storage is not used by this method,
    // so the work is done by the valid overload without a storage argument.
    template<class OutputIterator>
    ROCPRIM_DEVICE inline
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               unsigned int valid,
               storage_type& storage)
    {
        static_cast<void>(storage);
        this->store(block_output, items, valid);
    }
};
// Specialization for block_store_method::block_store_vectorize. Only the
// overload taking a plain T* output together with T items goes through
// block_store_direct_blocked_vectorized; every other combination uses
// block_store_direct_blocked.
template<class T,
         unsigned int BlockSizeX,
         unsigned int ItemsPerThread,
         unsigned int BlockSizeY,
         unsigned int BlockSizeZ>
class block_store<T,
                  BlockSizeX,
                  ItemsPerThread,
                  block_store_method::block_store_vectorize,
                  BlockSizeY,
                  BlockSizeZ>
{
private:
    using storage_type_ = typename ::rocprim::detail::empty_storage_type;

public:
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
    using storage_type = typename ::rocprim::detail::empty_storage_type;
#else
    using storage_type = storage_type_; // only for Doxygen
#endif

    // Pointer output with items of type T: vectorized store.
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(T* block_output,
               T (&_items)[ItemsPerThread])
    {
        const unsigned int thread_id
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_store_direct_blocked_vectorized(thread_id, block_output, _items);
    }

    // Any other output iterator / item type combination: plain blocked store.
    template<class OutputIterator, class U>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(OutputIterator block_output,
               U (&items)[ItemsPerThread])
    {
        const unsigned int thread_id
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_store_direct_blocked(thread_id, block_output, items);
    }

    // Store guarded by range valid: plain blocked store.
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               unsigned int valid)
    {
        const unsigned int thread_id
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_store_direct_blocked(thread_id, block_output, items, valid);
    }

    // Storage-taking variants: the storage is not used by this method, each
    // call is passed on to the matching overload without a storage argument.
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(T* block_output,
               T (&items)[ItemsPerThread],
               storage_type& storage)
    {
        static_cast<void>(storage);
        this->store(block_output, items);
    }

    template<class OutputIterator, class U>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(OutputIterator block_output,
               U (&items)[ItemsPerThread],
               storage_type& storage)
    {
        static_cast<void>(storage);
        this->store(block_output, items);
    }

    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               unsigned int valid,
               storage_type& storage)
    {
        static_cast<void>(storage);
        this->store(block_output, items, valid);
    }
};
/// \brief Specialization of block_store for
/// block_store_method::block_store_transpose.
///
/// Every store() first rearranges \p items in place with
/// block_exchange::blocked_to_striped and then writes them with
/// block_store_direct_striped. Note that \p items is therefore modified.
template<
    class T,
    unsigned int BlockSizeX,
    unsigned int ItemsPerThread,
    unsigned int BlockSizeY,
    unsigned int BlockSizeZ
>
class block_store<T,
                  BlockSizeX,
                  ItemsPerThread,
                  block_store_method::block_store_transpose,
                  BlockSizeY,
                  BlockSizeZ>
{
    // Total number of threads in the block.
    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;

private:
    using block_exchange_type = block_exchange<T, BlockSize, ItemsPerThread>;

public:
    /// Temporary storage required by the exchange step.
    using storage_type = typename block_exchange_type::storage_type;

    /// \brief Exchanges and stores \p items, using storage declared inside
    /// the function with ROCPRIM_SHARED_MEMORY.
    ///
    /// \tparam OutputIterator - [inferred] type of the destination iterator
    ///
    /// \param block_output - destination iterator
    /// \param items - the calling thread's items; rearranged in place
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread])
    {
        ROCPRIM_SHARED_MEMORY storage_type storage;

        const unsigned int thread_index
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();

        block_exchange_type().blocked_to_striped(items, items, storage);
        block_store_direct_striped<BlockSize>(thread_index, block_output, items);
    }

    /// \brief Guarded variant of store(block_output, items): \p valid is
    /// forwarded to block_store_direct_striped.
    ///
    /// \tparam OutputIterator - [inferred] type of the destination iterator
    ///
    /// \param block_output - destination iterator
    /// \param items - the calling thread's items; rearranged in place
    /// \param valid - bound forwarded to block_store_direct_striped
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               unsigned int valid)
    {
        ROCPRIM_SHARED_MEMORY storage_type storage;

        const unsigned int thread_index
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();

        block_exchange_type().blocked_to_striped(items, items, storage);
        block_store_direct_striped<BlockSize>(thread_index, block_output, items, valid);
    }

    /// \brief Exchanges and stores \p items, using caller-provided storage.
    ///
    /// \tparam OutputIterator - [inferred] type of the destination iterator
    ///
    /// \param block_output - destination iterator
    /// \param items - the calling thread's items; rearranged in place
    /// \param storage - temporary storage passed to the exchange step
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               storage_type& storage)
    {
        const unsigned int thread_index
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();

        block_exchange_type().blocked_to_striped(items, items, storage);
        block_store_direct_striped<BlockSize>(thread_index, block_output, items);
    }

    /// \brief Guarded variant of store(block_output, items, storage).
    ///
    /// \tparam OutputIterator - [inferred] type of the destination iterator
    ///
    /// \param block_output - destination iterator
    /// \param items - the calling thread's items; rearranged in place
    /// \param valid - bound forwarded to block_store_direct_striped
    /// \param storage - temporary storage passed to the exchange step
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               unsigned int valid,
               storage_type& storage)
    {
        const unsigned int thread_index
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();

        block_exchange_type().blocked_to_striped(items, items, storage);
        block_store_direct_striped<BlockSize>(thread_index, block_output, items, valid);
    }
};
/// \brief Specialization of block_store for
/// block_store_method::block_store_warp_transpose.
///
/// Every store() first rearranges \p items in place with
/// block_exchange::blocked_to_warp_striped and then writes them with
/// block_store_direct_warp_striped. Note that \p items is therefore modified.
/// The block size must be a multiple of ::rocprim::device_warp_size().
template<
    class T,
    unsigned int BlockSizeX,
    unsigned int ItemsPerThread,
    unsigned int BlockSizeY,
    unsigned int BlockSizeZ
>
class block_store<T,
                  BlockSizeX,
                  ItemsPerThread,
                  block_store_method::block_store_warp_transpose,
                  BlockSizeY,
                  BlockSizeZ>
{
    // Total number of threads in the block.
    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;

private:
    using block_exchange_type = block_exchange<T, BlockSize, ItemsPerThread>;

public:
    static_assert(BlockSize % ::rocprim::device_warp_size() == 0,
                  "BlockSize must be a multiple of hardware warpsize");

    /// Temporary storage required by the exchange step.
    using storage_type = typename block_exchange_type::storage_type;

    /// \brief Exchanges and stores \p items, using storage declared inside
    /// the function with ROCPRIM_SHARED_MEMORY.
    ///
    /// \tparam OutputIterator - [inferred] type of the destination iterator
    ///
    /// \param block_output - destination iterator
    /// \param items - the calling thread's items; rearranged in place
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread])
    {
        ROCPRIM_SHARED_MEMORY storage_type storage;

        const unsigned int thread_index
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();

        block_exchange_type().blocked_to_warp_striped(items, items, storage);
        block_store_direct_warp_striped(thread_index, block_output, items);
    }

    /// \brief Guarded variant of store(block_output, items): \p valid is
    /// forwarded to block_store_direct_warp_striped.
    ///
    /// \tparam OutputIterator - [inferred] type of the destination iterator
    ///
    /// \param block_output - destination iterator
    /// \param items - the calling thread's items; rearranged in place
    /// \param valid - bound forwarded to block_store_direct_warp_striped
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               unsigned int valid)
    {
        ROCPRIM_SHARED_MEMORY storage_type storage;

        const unsigned int thread_index
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();

        block_exchange_type().blocked_to_warp_striped(items, items, storage);
        block_store_direct_warp_striped(thread_index, block_output, items, valid);
    }

    /// \brief Exchanges and stores \p items, using caller-provided storage.
    ///
    /// \tparam OutputIterator - [inferred] type of the destination iterator
    ///
    /// \param block_output - destination iterator
    /// \param items - the calling thread's items; rearranged in place
    /// \param storage - temporary storage passed to the exchange step
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               storage_type& storage)
    {
        const unsigned int thread_index
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();

        block_exchange_type().blocked_to_warp_striped(items, items, storage);
        block_store_direct_warp_striped(thread_index, block_output, items);
    }

    /// \brief Guarded variant of store(block_output, items, storage).
    ///
    /// \tparam OutputIterator - [inferred] type of the destination iterator
    ///
    /// \param block_output - destination iterator
    /// \param items - the calling thread's items; rearranged in place
    /// \param valid - bound forwarded to block_store_direct_warp_striped
    /// \param storage - temporary storage passed to the exchange step
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               unsigned int valid,
               storage_type& storage)
    {
        const unsigned int thread_index
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();

        block_exchange_type().blocked_to_warp_striped(items, items, storage);
        block_store_direct_warp_striped(thread_index, block_output, items, valid);
    }
};
#endif // DOXYGEN_SHOULD_SKIP_THIS
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_BLOCK_BLOCK_STORE_HPP_
3rdparty/cub/rocprim/block/block_store_func.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2019 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_STORE_FUNC_HPP_
#define ROCPRIM_BLOCK_BLOCK_STORE_FUNC_HPP_
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "../types.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup blockmodule
/// @{
/// \brief Stores a blocked arrangement of items from across the thread block
/// into a blocked arrangement on contiguous memory.
///
/// The blocked arrangement is assumed to be (block-threads * \p ItemsPerThread)
/// items across a thread block. The thread identified by \p flat_id writes its
/// \p ItemsPerThread items to the consecutive positions starting at
/// \p flat_id * \p ItemsPerThread of \p block_output.
///
/// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_output - the output iterator of the thread block to store to
/// \param items - array holding the calling thread's items to be stored
template<
    class OutputIterator,
    class T,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_store_direct_blocked(unsigned int flat_id,
                                OutputIterator block_output,
                                T (&items)[ItemsPerThread])
{
    static_assert(std::is_assignable<decltype(block_output[0]), T>::value,
                  "The type T must be such that an object of type OutputIterator "
                  "can be dereferenced and assigned a value of type T.");

    // Position of this thread's first item in the output.
    const unsigned int first_position = flat_id * ItemsPerThread;
    OutputIterator destination = block_output + first_position;

    ROCPRIM_UNROLL
    for(unsigned int i = 0; i < ItemsPerThread; ++i)
    {
        destination[i] = items[i];
    }
}
/// \brief Stores a blocked arrangement of items from across the thread block
/// into a blocked arrangement on contiguous memory, which is guarded by range \p valid.
///
/// The blocked arrangement is assumed to be (block-threads * \p ItemsPerThread)
/// items across a thread block. The thread identified by \p flat_id owns the
/// consecutive output positions starting at \p flat_id * \p ItemsPerThread;
/// only positions smaller than \p valid are written.
///
/// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_output - the output iterator of the thread block to store to
/// \param items - array holding the calling thread's items to be stored
/// \param valid - number of leading output positions that may be written
template<
    class OutputIterator,
    class T,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_store_direct_blocked(unsigned int flat_id,
                                OutputIterator block_output,
                                T (&items)[ItemsPerThread],
                                unsigned int valid)
{
    static_assert(std::is_assignable<decltype(block_output[0]), T>::value,
                  "The type T must be such that an object of type OutputIterator "
                  "can be dereferenced and assigned a value of type T.");

    // Position of this thread's first item in the output.
    const unsigned int first_position = flat_id * ItemsPerThread;
    OutputIterator destination = block_output + first_position;

    ROCPRIM_UNROLL
    for(unsigned int i = 0; i < ItemsPerThread; ++i)
    {
        // Skip items that fall outside the valid range.
        if(first_position + i < valid)
        {
            destination[i] = items[i];
        }
    }
}
/// \brief Stores a blocked arrangement of items from across the thread block
/// into a blocked arrangement on contiguous memory, using vector-typed writes.
///
/// The blocked arrangement is assumed to be (block-threads * \p ItemsPerThread)
/// items across a thread block. Each thread uses a \p flat_id to store a range of
/// \p ItemsPerThread \p items to the thread block.
///
/// The output offset (\p block_output + offset) must be quad-item aligned.
///
/// The following conditions will prevent vectorization and switch to default
/// block_store_direct_blocked:
/// * \p ItemsPerThread is odd.
/// * The datatype \p T is not a primitive or a HIP vector type (e.g. int2,
/// int4, etc.).
///
/// This overload is selected when detail::is_vectorizable<T, ItemsPerThread>
/// is true.
///
/// \tparam T - [inferred] the output data type
/// \tparam U - [inferred] the input data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// The type \p U must be such that it can be implicitly converted to \p T.
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_output - pointer to the thread block's output to store to
/// \param items - array holding the calling thread's items to be stored
template<
    class T,
    class U,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto block_store_direct_blocked_vectorized(unsigned int flat_id,
                                           T* block_output,
                                           U (&items)[ItemsPerThread])
    -> typename std::enable_if<detail::is_vectorizable<T, ItemsPerThread>::value>::type
{
    static_assert(std::is_convertible<U, T>::value,
                  "The type U must be such that it can be implicitly converted to T.");

    using vector_type = typename detail::match_vector_type<T, ItemsPerThread>::type;

    // Number of vector_type elements covering one thread's items.
    constexpr unsigned int vectors_per_thread
        = (sizeof(T) * ItemsPerThread) / sizeof(vector_type);

    // View the destination as an array of vectors.
    vector_type* vector_output = reinterpret_cast<vector_type*>(block_output);

    // Staging buffer: filled element-wise as T (converting from U), then
    // written out vector by vector.
    vector_type staged_vectors[vectors_per_thread];
    T* staged_items = reinterpret_cast<T*>(staged_vectors);

    ROCPRIM_UNROLL
    for(unsigned int i = 0; i < ItemsPerThread; ++i)
    {
        staged_items[i] = items[i];
    }

    block_store_direct_blocked(flat_id, vector_output, staged_vectors);
}
/// \brief Fallback of block_store_direct_blocked_vectorized, selected when
/// detail::is_vectorizable<T, ItemsPerThread> is false: forwards unchanged to
/// block_store_direct_blocked.
///
/// \tparam T - [inferred] the output data type
/// \tparam U - [inferred] the input data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_output - pointer to the thread block's output to store to
/// \param items - array holding the calling thread's items to be stored
template<
    class T,
    class U,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto block_store_direct_blocked_vectorized(unsigned int flat_id,
                                           T* block_output,
                                           U (&items)[ItemsPerThread])
    -> typename std::enable_if<!detail::is_vectorizable<T, ItemsPerThread>::value>::type
{
    block_store_direct_blocked(flat_id, block_output, items);
}
/// \brief Stores a striped arrangement of items from across the thread block
/// into a blocked arrangement on contiguous memory.
///
/// The striped arrangement is assumed to be (\p BlockSize * \p ItemsPerThread) items
/// across a thread block. The thread identified by \p flat_id writes its i-th
/// item to position \p flat_id + i * \p BlockSize of \p block_output.
///
/// \tparam BlockSize - the number of threads in a block
/// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_output - the output iterator of the thread block to store to
/// \param items - array holding the calling thread's items to be stored
template<
    unsigned int BlockSize,
    class OutputIterator,
    class T,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_store_direct_striped(unsigned int flat_id,
                                OutputIterator block_output,
                                T (&items)[ItemsPerThread])
{
    static_assert(std::is_assignable<decltype(block_output[0]), T>::value,
                  "The type T must be such that an object of type OutputIterator "
                  "can be dereferenced and assigned a value of type T.");

    // Each thread starts at its own id and strides by the block size.
    OutputIterator destination = block_output + flat_id;

    ROCPRIM_UNROLL
    for(unsigned int i = 0; i < ItemsPerThread; ++i)
    {
        destination[i * BlockSize] = items[i];
    }
}
/// \brief Stores a striped arrangement of items from across the thread block
/// into a blocked arrangement on contiguous memory, which is guarded by range \p valid.
///
/// The striped arrangement is assumed to be (\p BlockSize * \p ItemsPerThread) items
/// across a thread block. The thread identified by \p flat_id owns positions
/// \p flat_id + i * \p BlockSize of \p block_output; only positions smaller
/// than \p valid are written.
///
/// \tparam BlockSize - the number of threads in a block
/// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_output - the output iterator of the thread block to store to
/// \param items - array holding the calling thread's items to be stored
/// \param valid - number of leading output positions that may be written
template<
    unsigned int BlockSize,
    class OutputIterator,
    class T,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_store_direct_striped(unsigned int flat_id,
                                OutputIterator block_output,
                                T (&items)[ItemsPerThread],
                                unsigned int valid)
{
    static_assert(std::is_assignable<decltype(block_output[0]), T>::value,
                  "The type T must be such that an object of type OutputIterator "
                  "can be dereferenced and assigned a value of type T.");

    // Each thread starts at its own id and strides by the block size.
    OutputIterator destination = block_output + flat_id;

    ROCPRIM_UNROLL
    for(unsigned int i = 0; i < ItemsPerThread; ++i)
    {
        const unsigned int stride_offset = i * BlockSize;
        // Skip items that fall outside the valid range.
        if(flat_id + stride_offset < valid)
        {
            destination[stride_offset] = items[i];
        }
    }
}
/// \brief Stores a warp-striped arrangement of items from across the thread block
/// into a blocked arrangement on contiguous memory.
///
/// The warp-striped arrangement is assumed to be (\p WarpSize * \p ItemsPerThread) items
/// across a thread block. The warp containing \p flat_id owns a contiguous
/// range of \p WarpSize * \p ItemsPerThread output positions; within it, the
/// calling lane writes its i-th item at lane id + i * \p WarpSize.
///
/// * The number of threads in the block must be a multiple of \p WarpSize.
/// * The default \p WarpSize is a hardware warpsize and is an optimal value.
/// * \p WarpSize must be a power of two and equal or less than the size of
/// hardware warp.
/// * Using \p WarpSize smaller than hardware warpsize could result in lower
/// performance.
///
/// \tparam WarpSize - [optional] the number of threads in a warp
/// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_output - the output iterator of the thread block to store to
/// \param items - array holding the calling thread's items to be stored
template<
    unsigned int WarpSize = device_warp_size(),
    class OutputIterator,
    class T,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_store_direct_warp_striped(unsigned int flat_id,
                                     OutputIterator block_output,
                                     T (&items)[ItemsPerThread])
{
    static_assert(std::is_assignable<decltype(block_output[0]), T>::value,
                  "The type T must be such that an object of type OutputIterator "
                  "can be dereferenced and assigned a value of type T.");

    // Adjacent literals are concatenated verbatim, so the first one must end
    // with a space for the diagnostic to read "equal or less than".
    static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
                  "WarpSize must be a power of two and equal or less "
                  "than the size of hardware warp.");

    // Lane of the calling thread inside its logical warp.
    unsigned int thread_id = detail::logical_lane_id<WarpSize>();
    // Index of that warp inside the block and the first output position it owns.
    unsigned int warp_id = flat_id / WarpSize;
    unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread;

    OutputIterator thread_iter = block_output + thread_id + warp_offset;
    ROCPRIM_UNROLL
    for(unsigned int item = 0; item < ItemsPerThread; item++)
    {
        thread_iter[item * WarpSize] = items[item];
    }
}
/// \brief Stores a warp-striped arrangement of items from across the thread block
/// into a blocked arrangement on contiguous memory, which is guarded by range \p valid.
///
/// The warp-striped arrangement is assumed to be (\p WarpSize * \p ItemsPerThread) items
/// across a thread block. The warp containing \p flat_id owns a contiguous
/// range of \p WarpSize * \p ItemsPerThread output positions; within it, the
/// calling lane owns positions lane id + i * \p WarpSize. Only positions
/// smaller than \p valid are written.
///
/// * The number of threads in the block must be a multiple of \p WarpSize.
/// * The default \p WarpSize is a hardware warpsize and is an optimal value.
/// * \p WarpSize must be a power of two and equal or less than the size of
/// hardware warp.
/// * Using \p WarpSize smaller than hardware warpsize could result in lower
/// performance.
///
/// \tparam WarpSize - [optional] the number of threads in a warp
/// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_output - the output iterator of the thread block to store to
/// \param items - array holding the calling thread's items to be stored
/// \param valid - number of leading output positions that may be written
template<
    unsigned int WarpSize = device_warp_size(),
    class OutputIterator,
    class T,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_store_direct_warp_striped(unsigned int flat_id,
                                     OutputIterator block_output,
                                     T (&items)[ItemsPerThread],
                                     unsigned int valid)
{
    static_assert(std::is_assignable<decltype(block_output[0]), T>::value,
                  "The type T must be such that an object of type OutputIterator "
                  "can be dereferenced and assigned a value of type T.");

    // Adjacent literals are concatenated verbatim, so the first one must end
    // with a space for the diagnostic to read "equal or less than".
    static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
                  "WarpSize must be a power of two and equal or less "
                  "than the size of hardware warp.");

    // Lane of the calling thread inside its logical warp.
    unsigned int thread_id = detail::logical_lane_id<WarpSize>();
    // Index of that warp inside the block and the first output position it owns.
    unsigned int warp_id = flat_id / WarpSize;
    unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread;

    OutputIterator thread_iter = block_output + thread_id + warp_offset;
    ROCPRIM_UNROLL
    for(unsigned int item = 0; item < ItemsPerThread; item++)
    {
        unsigned int offset = item * WarpSize;
        // Skip items that fall outside the valid range.
        if(warp_offset + thread_id + offset < valid)
        {
            thread_iter[offset] = items[item];
        }
    }
}
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_STORE_FUNC_HPP_
3rdparty/cub/rocprim/block/detail/block_adjacent_difference_impl.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_DETAIL_BLOCK_ADJACENT_DIFFERENCE_IMPL_HPP_
#define ROCPRIM_BLOCK_DETAIL_BLOCK_ADJACENT_DIFFERENCE_IMPL_HPP_
#include "../../config.hpp"
#include "../../detail/various.hpp"
#include "../../intrinsics/thread.hpp"
#include <type_traits>
#include <cassert>
BEGIN_ROCPRIM_NAMESPACE
namespace
detail
{
// Dispatch helper that lets callers invoke a BinaryFunction with either of
// these signatures: with an index, (a, b, b_index), or without one, (a, b).
// The index form is only available for discontinuity (when as_flags is true).
// block_discontinuity and block_adjacent_difference differ only in the order
// in which the operands are handed to the operator, so the `reversed` tag of
// these overloads selects that order as well.
//
// This overload: flags with index, operands in the given order -> op(a, b, index).
template<typename T, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto apply(BinaryFunction op,
           const T& a,
           const T& b,
           unsigned int index,
           bool_constant<true> /*as_flags*/,
           bool_constant<false> /*reversed*/)
    -> decltype(op(b, a, index))
{
    return op(a, b, index);
}
// Overload for flags with index and swapped operands -> op(b, a, index).
template<typename T, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto apply(BinaryFunction op,
           const T& a,
           const T& b,
           unsigned int index,
           bool_constant<true> /*as_flags*/,
           bool_constant<true> /*reversed*/)
    -> decltype(op(b, a, index))
{
    return op(b, a, index);
}
// Overload for operators without an index parameter, operands in the given
// order -> op(a, b). The index argument is accepted and ignored.
template<class T, class BinaryFunction, bool AsFlags>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto apply(BinaryFunction op,
           const T& a,
           const T& b,
           unsigned int /*index*/,
           bool_constant<AsFlags> /*as_flags*/,
           bool_constant<false> /*reversed*/)
    -> decltype(op(b, a))
{
    return op(a, b);
}
// Overload for operators without an index parameter and swapped operands
// -> op(b, a). The index argument is accepted and ignored.
template<class T, class BinaryFunction, bool AsFlags>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto apply(BinaryFunction op,
           const T& a,
           const T& b,
           unsigned int /*index*/,
           bool_constant<AsFlags> /*as_flags*/,
           bool_constant<true> /*reversed*/)
    -> decltype(op(b, a))
{
    return op(b, a);
}
/// \brief Shared implementation that combines every item of a blocked
/// arrangement with its left or right neighbour through a binary operator.
///
/// Neighbours that belong to an adjacent thread are exchanged through
/// \p storage_type, which holds one \p T per thread of the block.
///
/// \tparam T - the item type
/// \tparam BlockSizeX - the number of threads in the block in the X dimension
/// \tparam BlockSizeY - the number of threads in the block in the Y dimension
/// \tparam BlockSizeZ - the number of threads in the block in the Z dimension
template<
    typename T,
    unsigned int BlockSizeX,
    unsigned int BlockSizeY = 1,
    unsigned int BlockSizeZ = 1
>
class block_adjacent_difference_impl
{
public:
    /// Total number of threads in the block.
    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;

    /// One slot per thread, used to publish a thread's boundary item.
    struct storage_type
    {
        T items[BlockSize];
    };

    /// \brief For every item writes \p op applied to the item and its left
    /// neighbour.
    ///
    /// The left neighbour of a thread's first item is the last item of the
    /// previous thread. For the first thread of the block it is
    /// \p tile_predecessor_item when \p WithTilePredecessor is true; otherwise
    /// that output is the default item (1 when \p AsFlags, else the input item).
    ///
    /// \tparam AsFlags - forwarded to detail::apply and get_default_item as a tag
    /// \tparam Reversed - forwarded to detail::apply as a tag (operand order)
    /// \tparam WithTilePredecessor - whether \p tile_predecessor_item is used
    ///
    /// \param input - the calling thread's items
    /// \param output - receives the results; written from the last index down to 0
    /// \param op - the binary operator
    /// \param tile_predecessor_item - left neighbour of the block's first item
    /// \param storage - exchange storage; written before the block barrier and read after it
    template<
        bool AsFlags,
        bool Reversed,
        bool WithTilePredecessor,
        unsigned int ItemsPerThread,
        typename Output,
        typename BinaryFunction
    >
    ROCPRIM_DEVICE
    void apply_left(const T (&input)[ItemsPerThread],
                    Output (&output)[ItemsPerThread],
                    BinaryFunction op,
                    const T tile_predecessor_item,
                    storage_type& storage)
    {
        static constexpr auto flags_tag = bool_constant<AsFlags>{};
        static constexpr auto order_tag = bool_constant<Reversed>{};

        const unsigned int thread_index
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();

        // Publish this thread's last item for the thread to the right.
        storage.items[thread_index] = input[ItemsPerThread - 1];

        // Walk from the back so that output[k] is written only after the
        // inputs it depends on have been read.
        ROCPRIM_UNROLL
        for(unsigned int k = ItemsPerThread - 1; k > 0; --k)
        {
            output[k] = detail::apply(op,
                                      input[k - 1],
                                      input[k],
                                      thread_index * ItemsPerThread + k,
                                      flags_tag,
                                      order_tag);
        }
        ::rocprim::syncthreads();

        if ROCPRIM_IF_CONSTEXPR(WithTilePredecessor)
        {
            T left_neighbour = tile_predecessor_item;
            if(thread_index != 0)
            {
                left_neighbour = storage.items[thread_index - 1];
            }

            output[0] = detail::apply(op,
                                      left_neighbour,
                                      input[0],
                                      thread_index * ItemsPerThread,
                                      flags_tag,
                                      order_tag);
        }
        else
        {
            output[0] = get_default_item(input, 0, flags_tag);
            if(thread_index != 0)
            {
                output[0] = detail::apply(op,
                                          storage.items[thread_index - 1],
                                          input[0],
                                          thread_index * ItemsPerThread,
                                          flags_tag,
                                          order_tag);
            }
        }
    }

    /// \brief Variant of apply_left that applies \p op only to items whose
    /// block-wide index is smaller than \p valid_items; all other outputs
    /// receive the default item (1 when \p AsFlags, else the input item).
    ///
    /// \param input - the calling thread's items
    /// \param output - receives the results; written from the last index down to 0
    /// \param op - the binary operator
    /// \param tile_predecessor_item - left neighbour of the block's first item,
    /// used when \p WithTilePredecessor is true
    /// \param valid_items - number of valid items; asserted to be at most
    /// \p BlockSize * \p ItemsPerThread
    /// \param storage - exchange storage; written before the block barrier and read after it
    template<
        bool AsFlags,
        bool Reversed,
        bool WithTilePredecessor,
        unsigned int ItemsPerThread,
        typename Output,
        typename BinaryFunction
    >
    ROCPRIM_DEVICE
    void apply_left_partial(const T (&input)[ItemsPerThread],
                            Output (&output)[ItemsPerThread],
                            BinaryFunction op,
                            const T tile_predecessor_item,
                            const unsigned int valid_items,
                            storage_type& storage)
    {
        static constexpr auto flags_tag = bool_constant<AsFlags>{};
        static constexpr auto order_tag = bool_constant<Reversed>{};

        assert(valid_items <= BlockSize * ItemsPerThread);

        const unsigned int thread_index
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();

        // Publish this thread's last item for the thread to the right.
        storage.items[thread_index] = input[ItemsPerThread - 1];

        // Walk from the back so that output[k] is written only after the
        // inputs it depends on have been read.
        ROCPRIM_UNROLL
        for(unsigned int k = ItemsPerThread - 1; k > 0; --k)
        {
            const unsigned int item_index = thread_index * ItemsPerThread + k;

            output[k] = get_default_item(input, k, flags_tag);
            if(item_index < valid_items)
            {
                output[k] = detail::apply(op,
                                          input[k - 1],
                                          input[k],
                                          item_index,
                                          flags_tag,
                                          order_tag);
            }
        }
        ::rocprim::syncthreads();

        // Block-wide index of this thread's first item.
        const unsigned int first_index = thread_index * ItemsPerThread;

        if ROCPRIM_IF_CONSTEXPR(WithTilePredecessor)
        {
            T left_neighbour = tile_predecessor_item;
            if(thread_index != 0)
            {
                left_neighbour = storage.items[thread_index - 1];
            }

            output[0] = get_default_item(input, 0, flags_tag);
            if(first_index < valid_items)
            {
                output[0] = detail::apply(op,
                                          left_neighbour,
                                          input[0],
                                          first_index,
                                          flags_tag,
                                          order_tag);
            }
        }
        else
        {
            output[0] = get_default_item(input, 0, flags_tag);
            if(thread_index != 0 && first_index < valid_items)
            {
                output[0] = detail::apply(op,
                                          storage.items[thread_index - 1],
                                          input[0],
                                          first_index,
                                          flags_tag,
                                          order_tag);
            }
        }
    }

    /// \brief For every item writes \p op applied to the item and its right
    /// neighbour.
    ///
    /// The right neighbour of a thread's last item is the first item of the
    /// next thread. For the last thread of the block it is
    /// \p tile_successor_item when \p WithTileSuccessor is true; otherwise
    /// that output is the default item (1 when \p AsFlags, else the input item).
    ///
    /// \param input - the calling thread's items
    /// \param output - receives the results; written from index 0 upwards
    /// \param op - the binary operator
    /// \param tile_successor_item - right neighbour of the block's last item
    /// \param storage - exchange storage; written before the block barrier and read after it
    template<
        bool AsFlags,
        bool Reversed,
        bool WithTileSuccessor,
        unsigned int ItemsPerThread,
        typename Output,
        typename BinaryFunction
    >
    ROCPRIM_DEVICE
    void apply_right(const T (&input)[ItemsPerThread],
                     Output (&output)[ItemsPerThread],
                     BinaryFunction op,
                     const T tile_successor_item,
                     storage_type& storage)
    {
        static constexpr auto flags_tag = bool_constant<AsFlags>{};
        static constexpr auto order_tag = bool_constant<Reversed>{};

        const unsigned int thread_index
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();

        // Publish this thread's first item for the thread to the left.
        storage.items[thread_index] = input[0];

        // Walk from the front so that output[k] is written only after the
        // inputs it depends on have been read.
        ROCPRIM_UNROLL
        for(unsigned int k = 0; k < ItemsPerThread - 1; ++k)
        {
            output[k] = detail::apply(op,
                                      input[k],
                                      input[k + 1],
                                      thread_index * ItemsPerThread + k + 1,
                                      flags_tag,
                                      order_tag);
        }
        ::rocprim::syncthreads();

        if ROCPRIM_IF_CONSTEXPR(WithTileSuccessor)
        {
            T right_neighbour = tile_successor_item;
            if(thread_index != BlockSize - 1)
            {
                right_neighbour = storage.items[thread_index + 1];
            }

            output[ItemsPerThread - 1]
                = detail::apply(op,
                                input[ItemsPerThread - 1],
                                right_neighbour,
                                thread_index * ItemsPerThread + ItemsPerThread,
                                flags_tag,
                                order_tag);
        }
        else
        {
            output[ItemsPerThread - 1]
                = get_default_item(input, ItemsPerThread - 1, flags_tag);
            if(thread_index != BlockSize - 1)
            {
                output[ItemsPerThread - 1]
                    = detail::apply(op,
                                    input[ItemsPerThread - 1],
                                    storage.items[thread_index + 1],
                                    thread_index * ItemsPerThread + ItemsPerThread,
                                    flags_tag,
                                    order_tag);
            }
        }
    }

    /// \brief Variant of apply_right that applies \p op only where the right
    /// neighbour's block-wide index is smaller than \p valid_items; all other
    /// outputs receive the default item (1 when \p AsFlags, else the input item).
    ///
    /// \param input - the calling thread's items
    /// \param output - receives the results; written from index 0 upwards
    /// \param op - the binary operator
    /// \param valid_items - number of valid items; asserted to be at most
    /// \p BlockSize * \p ItemsPerThread
    /// \param storage - exchange storage; written before the block barrier and read after it
    template<
        bool AsFlags,
        bool Reversed,
        unsigned int ItemsPerThread,
        typename Output,
        typename BinaryFunction
    >
    ROCPRIM_DEVICE
    void apply_right_partial(const T (&input)[ItemsPerThread],
                             Output (&output)[ItemsPerThread],
                             BinaryFunction op,
                             const unsigned int valid_items,
                             storage_type& storage)
    {
        static constexpr auto flags_tag = bool_constant<AsFlags>{};
        static constexpr auto order_tag = bool_constant<Reversed>{};

        assert(valid_items <= BlockSize * ItemsPerThread);

        const unsigned int thread_index
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();

        // Publish this thread's first item for the thread to the left.
        storage.items[thread_index] = input[0];

        // Walk from the front so that output[k] is written only after the
        // inputs it depends on have been read.
        ROCPRIM_UNROLL
        for(unsigned int k = 0; k < ItemsPerThread - 1; ++k)
        {
            // Block-wide index of the right neighbour of item k.
            const unsigned int neighbour_index = thread_index * ItemsPerThread + k + 1;

            output[k] = get_default_item(input, k, flags_tag);
            if(neighbour_index < valid_items)
            {
                output[k] = detail::apply(op,
                                          input[k],
                                          input[k + 1],
                                          neighbour_index,
                                          flags_tag,
                                          order_tag);
            }
        }
        ::rocprim::syncthreads();

        output[ItemsPerThread - 1]
            = get_default_item(input, ItemsPerThread - 1, flags_tag);

        // Block-wide index of the next thread's first item.
        const unsigned int next_thread_first_index
            = thread_index * ItemsPerThread + ItemsPerThread;
        if(next_thread_first_index < valid_items)
        {
            output[ItemsPerThread - 1]
                = detail::apply(op,
                                input[ItemsPerThread - 1],
                                storage.items[thread_index + 1],
                                next_thread_first_index,
                                flags_tag,
                                order_tag);
        }
    }

private:
    /// Default output when producing flags: always 1.
    template<unsigned int ItemsPerThread>
    ROCPRIM_DEVICE
    int get_default_item(const T (&)[ItemsPerThread],
                         unsigned int /*index*/,
                         bool_constant<true> /*as_flags*/)
    {
        return 1;
    }

    /// Default output when not producing flags: the input item at \p index.
    template<unsigned int ItemsPerThread>
    ROCPRIM_DEVICE
    T get_default_item(const T (&input)[ItemsPerThread],
                       const unsigned int index,
                       bool_constant<false> /*as_flags*/)
    {
        return input[index];
    }
};
}
// namespace detail
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_BLOCK_DETAIL_BLOCK_ADJACENT_DIFFERENCE_IMPL_HPP_
Prev
1
2
3
4
5
6
7
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment