// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_INTRINSICS_WARP_HPP_
#define ROCPRIM_INTRINSICS_WARP_HPP_
#include "../config.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup intrinsicsmodule
/// @{
/// Evaluate predicate for all active work-items in the warp and return an integer
/// whose i-th bit is set if and only if \p predicate is true
/// for the i-th thread of the warp and the i-th thread is active.
///
/// \param predicate - input to be evaluated for all active lanes
ROCPRIM_DEVICE ROCPRIM_INLINE
lane_mask_type ballot(int predicate)
{
return ::__ballot(predicate);
}
/// \brief Masked bit count
///
/// For each thread, this function returns the number of active threads which
/// have i-th bit of \p x set and come before the current thread.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int masked_bit_count(lane_mask_type x, unsigned int add = 0)
{
int c;
#ifndef __HIP_CPU_RT__
#if __AMDGCN_WAVEFRONT_SIZE == 32
#ifdef __CUDACC__
c = ::__builtin_amdgcn_mbcnt_lo(x, add);
#else
c = ::__mbcnt_lo(x, add);
#endif
#else
#ifdef __CUDACC__
c = ::__builtin_amdgcn_mbcnt_lo(static_cast(x), add);
c = ::__builtin_amdgcn_mbcnt_hi(static_cast(x >> 32), c);
#else
c = ::__mbcnt_lo(static_cast(x), add);
c = ::__mbcnt_hi(static_cast(x >> 32), c);
#endif
#endif
#else
using namespace hip::detail;
const auto tidx{id(Fiber::this_fiber()) % warpSize};
std::bitset bits{x >> (warpSize - tidx)};
c = static_cast(bits.count()) + add;
#endif
return c;
}
namespace detail
{
ROCPRIM_DEVICE ROCPRIM_INLINE
int warp_any(int predicate)
{
#ifndef __HIP_CPU_RT__
return ::__any(predicate);
#else
using namespace hip::detail;
const auto tidx{id(Fiber::this_fiber()) % warpSize};
auto& lds{Tile::scratchpad, 1>()[0]};
lds[tidx] = static_cast(predicate);
barrier(Tile::this_tile());
return lds.any();
#endif
}
ROCPRIM_DEVICE ROCPRIM_INLINE
int warp_all(int predicate)
{
#ifndef __HIP_CPU_RT__
return ::__all(predicate);
#else
using namespace hip::detail;
const auto tidx{id(Fiber::this_fiber()) % warpSize};
auto& lds{Tile::scratchpad, 1>()[0]};
lds[tidx] = static_cast(predicate);
barrier(Tile::this_tile());
return lds.all();
#endif
}
} // end detail namespace
/// @}
// end of group intrinsicsmodule
/**
* Compute a 32b mask of threads having the same least-significant
* LABEL_BITS of \p label as the calling thread.
*/
template
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int MatchAny(unsigned int label)
{
unsigned int retval;
// Extract masks of common threads for each bit
ROCPRIM_UNROLL
for (int BIT = 0; BIT < LABEL_BITS; ++BIT)
{
unsigned long long mask;
unsigned long long current_bit = 1 << BIT;
mask = label & current_bit;
bool bit_match = (mask==current_bit);
mask = ballot(bit_match);
if(!bit_match)
{
mask = ! mask;
}
// Remove peers who differ
retval = (BIT == 0) ? mask : retval & mask;
}
return retval;
}
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_INTRINSICS_WARP_HPP_