/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __CLANG_CUDAMOCKER_INTRINSICS_H__
#define __CLANG_CUDAMOCKER_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif

// sm_30 intrinsics: __shfl_{up,down,xor}.

#define __SM_30_INTRINSICS_H__
#define __SM_30_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreserved-identifier"
#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
#pragma clang diagnostic ignored "-Wsign-conversion"
#pragma clang diagnostic ignored "-Wold-style-cast"
#pragma clang diagnostic ignored "-Wc++98-compat"
#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
#endif


// Returns the calling thread's lane index within its wavefront, computed by
// counting (mbcnt lo + hi, full mask) the lanes that precede the caller.
__device__ static inline unsigned int __lane_id() {
    // Count over the low half of the mask first, then accumulate the
    // high half on top of it.
    const unsigned int below_in_low_half = __builtin_amdgcn_mbcnt_lo(-1, 0);
    return __builtin_amdgcn_mbcnt_hi(-1, below_in_low_half);
}

// Returns the value of `var` held by lane `src_lane` of the caller's
// `width`-lane group. The masking below assumes `width` is a power of two.
__device__
inline
int __shfl(int var, int src_lane, int width = warpSize) {
    const int lane = __lane_id();
    const int in_group_mask = width - 1;
    // First lane of the caller's group, plus the source offset wrapped into the group.
    const int group_base = lane & ~in_group_mask;
    const int source = (src_lane & in_group_mask) + group_base;
    // The lane index is scaled by 4 before being handed to ds_bpermute.
    return __builtin_amdgcn_ds_bpermute(source << 2, var);
}

// short overload: widens to int, shuffles, and narrows the result back.
__device__
inline
short __shfl(short var, int src_lane, int width = warpSize) {
    return static_cast<short>(__shfl(static_cast<int>(var), src_lane, width));
}
// unsigned short overload: routes the 16-bit pattern through the short overload.
__device__
inline
unsigned short __shfl(unsigned short var, int src_lane, int width = warpSize) {
    // Same 16 bits viewed as a signed value for the shuffle, then viewed back.
    const short as_signed = static_cast<short>(var);
    const short moved = __shfl(as_signed, src_lane, width);
    return static_cast<unsigned short>(moved);
}

// unsigned int overload: shuffles the same 32-bit pattern through the int overload.
__device__
inline
unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
    const int moved = __shfl(static_cast<int>(var), src_lane, width);
    return static_cast<unsigned int>(moved);
}

// float overload: shuffles the raw 32-bit representation through the int overload.
__device__
inline
float __shfl(float var, int src_lane, int width = warpSize) {
    static_assert(sizeof(float) == sizeof(int), "");

    // Copy the bit pattern out, move it between lanes, and copy it back in.
    int bits;
    __builtin_memcpy(&bits, &var, sizeof(bits));
    bits = __shfl(bits, src_lane, width);

    float result;
    __builtin_memcpy(&result, &bits, sizeof(bits));
    return result;
}

// double overload: shuffles the value as two 32-bit words through the int
// overload and reassembles the 64-bit pattern afterwards.
__device__
inline
double __shfl(double var, int src_lane, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int words[2];
    __builtin_memcpy(words, &var, sizeof(words));
    const int first = __shfl(words[0], src_lane, width);
    const int second = __shfl(words[1], src_lane, width);

    // `second` supplies bits 63..32 of the pattern, `first` bits 31..0.
    const uint64_t bits = (static_cast<uint64_t>(second) << 32ull) | static_cast<uint32_t>(first);
    double result;
    __builtin_memcpy(&result, &bits, sizeof(bits));
    return result;
}

// long overload: two 32-bit shuffles when long is 64-bit, a single int
// shuffle when _MSC_VER is defined (long asserted to be int-sized there).
__device__
inline
long __shfl(long var, int src_lane, int width = warpSize) {
#ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int words[2];
    __builtin_memcpy(words, &var, sizeof(words));
    const int first = __shfl(words[0], src_lane, width);
    const int second = __shfl(words[1], src_lane, width);

    // `second` supplies bits 63..32 of the pattern, `first` bits 31..0.
    const uint64_t bits = (static_cast<uint64_t>(second) << 32ull) | static_cast<uint32_t>(first);
    long result;
    __builtin_memcpy(&result, &bits, sizeof(bits));
    return result;
#else
    static_assert(sizeof(long) == sizeof(int), "");
    const int moved = __shfl(static_cast<int>(var), src_lane, width);
    return static_cast<long>(moved);
#endif
}
// unsigned long overload: two 32-bit shuffles when unsigned long is 64-bit,
// a single unsigned int shuffle when _MSC_VER is defined.
__device__
inline
unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
#ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int words[2];
    __builtin_memcpy(words, &var, sizeof(words));
    const unsigned int first = __shfl(words[0], src_lane, width);
    const unsigned int second = __shfl(words[1], src_lane, width);

    // `second` supplies bits 63..32 of the pattern, `first` bits 31..0.
    const uint64_t bits = (static_cast<uint64_t>(second) << 32ull) | static_cast<uint32_t>(first);
    unsigned long result;
    __builtin_memcpy(&result, &bits, sizeof(bits));
    return result;
#else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    const unsigned int moved = __shfl(static_cast<unsigned int>(var), src_lane, width);
    return static_cast<unsigned long>(moved);
#endif
}
// long long overload: shuffles the value as two 32-bit words through the int
// overload and reassembles the 64-bit pattern afterwards.
__device__
inline
long long __shfl(long long var, int src_lane, int width = warpSize) {
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");

    int words[2];
    __builtin_memcpy(words, &var, sizeof(words));
    const int first = __shfl(words[0], src_lane, width);
    const int second = __shfl(words[1], src_lane, width);

    // `second` supplies bits 63..32 of the pattern, `first` bits 31..0.
    const uint64_t bits = (static_cast<uint64_t>(second) << 32ull) | static_cast<uint32_t>(first);
    long long result;
    __builtin_memcpy(&result, &bits, sizeof(bits));
    return result;
}
// unsigned long long overload: shuffles the value as two 32-bit words through
// the unsigned int overload and reassembles the 64-bit pattern afterwards.
__device__
inline
unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

    unsigned int words[2];
    __builtin_memcpy(words, &var, sizeof(words));
    const unsigned int first = __shfl(words[0], src_lane, width);
    const unsigned int second = __shfl(words[1], src_lane, width);

    // `second` supplies bits 63..32 of the pattern, `first` bits 31..0.
    const uint64_t bits = (static_cast<uint64_t>(second) << 32ull) | static_cast<uint32_t>(first);
    unsigned long long result;
    __builtin_memcpy(&result, &bits, sizeof(bits));
    return result;
}

// Returns `var` from the lane `lane_delta` positions below the caller. If that
// lane would fall before the start of the caller's `width`-lane group, the
// caller reads its own value instead. Assumes `width` is a power of two.
__device__
inline
int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
    const int lane = __lane_id();
    const int group_base = lane & ~(width - 1);
    int source = lane - lane_delta;
    if (source < group_base) {
        source = lane;
    }
    // The lane index is scaled by 4 before being handed to ds_bpermute.
    return __builtin_amdgcn_ds_bpermute(source << 2, var);
}

// short overload: widens to int, shuffles, and narrows the result back.
__device__
inline
short __shfl_up(short var, unsigned int lane_delta, int width = warpSize) {
    return static_cast<short>(__shfl_up(static_cast<int>(var), lane_delta, width));
}
// unsigned short overload: routes the 16-bit pattern through the short overload.
__device__
inline
unsigned short __shfl_up(unsigned short var, unsigned int lane_delta, int width = warpSize) {
    // Same 16 bits viewed as a signed value for the shuffle, then viewed back.
    const short as_signed = static_cast<short>(var);
    const short moved = __shfl_up(as_signed, lane_delta, width);
    return static_cast<unsigned short>(moved);
}

// unsigned int overload: shuffles the same 32-bit pattern through the int overload.
__device__
inline
unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    const int moved = __shfl_up(static_cast<int>(var), lane_delta, width);
    return static_cast<unsigned int>(moved);
}

// float overload: shuffles the raw 32-bit representation through the int overload.
__device__
inline
float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(float) == sizeof(int), "");

    // Copy the bit pattern out, move it between lanes, and copy it back in.
    int bits;
    __builtin_memcpy(&bits, &var, sizeof(bits));
    bits = __shfl_up(bits, lane_delta, width);

    float result;
    __builtin_memcpy(&result, &bits, sizeof(bits));
    return result;
}

// double overload: shuffles the value as two 32-bit words through the int
// overload and reassembles the 64-bit pattern afterwards.
__device__
inline
double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int words[2];
    __builtin_memcpy(words, &var, sizeof(words));
    const int first = __shfl_up(words[0], lane_delta, width);
    const int second = __shfl_up(words[1], lane_delta, width);

    // `second` supplies bits 63..32 of the pattern, `first` bits 31..0.
    const uint64_t bits = (static_cast<uint64_t>(second) << 32ull) | static_cast<uint32_t>(first);
    double result;
    __builtin_memcpy(&result, &bits, sizeof(bits));
    return result;
}

// long overload: two 32-bit shuffles when long is 64-bit, a single int
// shuffle when _MSC_VER is defined (long asserted to be int-sized there).
__device__
inline
long __shfl_up(long var, unsigned int lane_delta, int width = warpSize) {
#ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int words[2];
    __builtin_memcpy(words, &var, sizeof(words));
    const int first = __shfl_up(words[0], lane_delta, width);
    const int second = __shfl_up(words[1], lane_delta, width);

    // `second` supplies bits 63..32 of the pattern, `first` bits 31..0.
    const uint64_t bits = (static_cast<uint64_t>(second) << 32ull) | static_cast<uint32_t>(first);
    long result;
    __builtin_memcpy(&result, &bits, sizeof(bits));
    return result;
#else
    static_assert(sizeof(long) == sizeof(int), "");
    const int moved = __shfl_up(static_cast<int>(var), lane_delta, width);
    return static_cast<long>(moved);
#endif
}

// unsigned long overload: two 32-bit shuffles when unsigned long is 64-bit,
// a single unsigned int shuffle when _MSC_VER is defined.
__device__
inline
unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize) {
#ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int words[2];
    __builtin_memcpy(words, &var, sizeof(words));
    const unsigned int first = __shfl_up(words[0], lane_delta, width);
    const unsigned int second = __shfl_up(words[1], lane_delta, width);

    // `second` supplies bits 63..32 of the pattern, `first` bits 31..0.
    const uint64_t bits = (static_cast<uint64_t>(second) << 32ull) | static_cast<uint32_t>(first);
    unsigned long result;
    __builtin_memcpy(&result, &bits, sizeof(bits));
    return result;
#else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    const unsigned int moved = __shfl_up(static_cast<unsigned int>(var), lane_delta, width);
    return static_cast<unsigned long>(moved);
#endif
}

// long long overload: shuffles the value as two 32-bit words through the int
// overload and reassembles the 64-bit pattern afterwards.
__device__
inline
long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");

    int words[2];
    __builtin_memcpy(words, &var, sizeof(words));
    const int first = __shfl_up(words[0], lane_delta, width);
    const int second = __shfl_up(words[1], lane_delta, width);

    // `second` supplies bits 63..32 of the pattern, `first` bits 31..0.
    const uint64_t bits = (static_cast<uint64_t>(second) << 32ull) | static_cast<uint32_t>(first);
    long long result;
    __builtin_memcpy(&result, &bits, sizeof(bits));
    return result;
}

// unsigned long long overload: shuffles the value as two 32-bit words through
// the unsigned int overload and reassembles the 64-bit pattern afterwards.
__device__
inline
unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

    unsigned int words[2];
    __builtin_memcpy(words, &var, sizeof(words));
    const unsigned int first = __shfl_up(words[0], lane_delta, width);
    const unsigned int second = __shfl_up(words[1], lane_delta, width);

    // `second` supplies bits 63..32 of the pattern, `first` bits 31..0.
    const uint64_t bits = (static_cast<uint64_t>(second) << 32ull) | static_cast<uint32_t>(first);
    unsigned long long result;
    __builtin_memcpy(&result, &bits, sizeof(bits));
    return result;
}

// Returns `var` from the lane `lane_delta` positions above the caller. If that
// lane would fall past the end of the caller's `width`-lane group, the caller
// reads its own value instead. Assumes `width` is a power of two.
__device__
inline
int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
    const int lane = __lane_id();
    // Caller's position inside its group, advanced by the requested delta.
    const int target_in_group = static_cast<int>((lane & (width - 1)) + lane_delta);
    int source = lane + lane_delta;
    if (target_in_group >= width) {
        source = lane;
    }
    // The lane index is scaled by 4 before being handed to ds_bpermute.
    return __builtin_amdgcn_ds_bpermute(source << 2, var);
}

// short overload: widens to int, shuffles, and narrows the result back.
__device__
inline
short __shfl_down(short var, unsigned int lane_delta, int width = warpSize) {
    return static_cast<short>(__shfl_down(static_cast<int>(var), lane_delta, width));
}
// unsigned short overload: routes the 16-bit pattern through the short overload.
__device__
inline
unsigned short __shfl_down(unsigned short var, unsigned int lane_delta, int width = warpSize) {
    // Same 16 bits viewed as a signed value for the shuffle, then viewed back.
    const short as_signed = static_cast<short>(var);
    const short moved = __shfl_down(as_signed, lane_delta, width);
    return static_cast<unsigned short>(moved);
}

// unsigned int overload: shuffles the same 32-bit pattern through the int overload.
__device__
inline
unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    const int moved = __shfl_down(static_cast<int>(var), lane_delta, width);
    return static_cast<unsigned int>(moved);
}

// float overload: shuffles the raw 32-bit representation through the int overload.
__device__
inline
float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(float) == sizeof(int), "");

    // Copy the bit pattern out, move it between lanes, and copy it back in.
    int bits;
    __builtin_memcpy(&bits, &var, sizeof(bits));
    bits = __shfl_down(bits, lane_delta, width);

    float result;
    __builtin_memcpy(&result, &bits, sizeof(bits));
    return result;
}

// double overload: shuffles the value as two 32-bit words through the int
// overload and reassembles the 64-bit pattern afterwards.
__device__
inline
double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int words[2];
    __builtin_memcpy(words, &var, sizeof(words));
    const int first = __shfl_down(words[0], lane_delta, width);
    const int second = __shfl_down(words[1], lane_delta, width);

    // `second` supplies bits 63..32 of the pattern, `first` bits 31..0.
    const uint64_t bits = (static_cast<uint64_t>(second) << 32ull) | static_cast<uint32_t>(first);
    double result;
    __builtin_memcpy(&result, &bits, sizeof(bits));
    return result;
}

// long overload: two 32-bit shuffles when long is 64-bit, a single int
// shuffle when _MSC_VER is defined (long asserted to be int-sized there).
__device__
inline
long __shfl_down(long var, unsigned int lane_delta, int width = warpSize) {
#ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int words[2];
    __builtin_memcpy(words, &var, sizeof(words));
    const int first = __shfl_down(words[0], lane_delta, width);
    const int second = __shfl_down(words[1], lane_delta, width);

    // `second` supplies bits 63..32 of the pattern, `first` bits 31..0.
    const uint64_t bits = (static_cast<uint64_t>(second) << 32ull) | static_cast<uint32_t>(first);
    long result;
    __builtin_memcpy(&result, &bits, sizeof(bits));
    return result;
#else
    static_assert(sizeof(long) == sizeof(int), "");
    const int moved = __shfl_down(static_cast<int>(var), lane_delta, width);
    return static_cast<long>(moved);
#endif
}
// unsigned long overload: two 32-bit shuffles when unsigned long is 64-bit,
// a single unsigned int shuffle when _MSC_VER is defined.
__device__
inline
unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize) {
#ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int words[2];
    __builtin_memcpy(words, &var, sizeof(words));
    const unsigned int first = __shfl_down(words[0], lane_delta, width);
    const unsigned int second = __shfl_down(words[1], lane_delta, width);

    // `second` supplies bits 63..32 of the pattern, `first` bits 31..0.
    const uint64_t bits = (static_cast<uint64_t>(second) << 32ull) | static_cast<uint32_t>(first);
    unsigned long result;
    __builtin_memcpy(&result, &bits, sizeof(bits));
    return result;
#else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    const unsigned int moved = __shfl_down(static_cast<unsigned int>(var), lane_delta, width);
    return static_cast<unsigned long>(moved);
#endif
}
// long long overload: shuffles the value as two 32-bit words through the int
// overload and reassembles the 64-bit pattern afterwards.
__device__
inline
long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");

    int words[2];
    __builtin_memcpy(words, &var, sizeof(words));
    const int first = __shfl_down(words[0], lane_delta, width);
    const int second = __shfl_down(words[1], lane_delta, width);

    // `second` supplies bits 63..32 of the pattern, `first` bits 31..0.
    const uint64_t bits = (static_cast<uint64_t>(second) << 32ull) | static_cast<uint32_t>(first);
    long long result;
    __builtin_memcpy(&result, &bits, sizeof(bits));
    return result;
}
// unsigned long long overload: shuffles the value as two 32-bit words through
// the unsigned int overload and reassembles the 64-bit pattern afterwards.
__device__
inline
unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

    unsigned int words[2];
    __builtin_memcpy(words, &var, sizeof(words));
    const unsigned int first = __shfl_down(words[0], lane_delta, width);
    const unsigned int second = __shfl_down(words[1], lane_delta, width);

    // `second` supplies bits 63..32 of the pattern, `first` bits 31..0.
    const uint64_t bits = (static_cast<uint64_t>(second) << 32ull) | static_cast<uint32_t>(first);
    unsigned long long result;
    __builtin_memcpy(&result, &bits, sizeof(bits));
    return result;
}

// Returns `var` from the lane whose index is the caller's lane index XOR
// `lane_mask`. If that index lies at or beyond the end of the caller's
// `width`-lane group, the caller reads its own value instead.
// Assumes `width` is a power of two.
__device__
inline
int __shfl_xor(int var, int lane_mask, int width = warpSize) {
    const int lane = __lane_id();
    // First lane index past the caller's group.
    const int group_end = (lane + width) & ~(width - 1);
    int source = lane ^ lane_mask;
    if (source >= group_end) {
        source = lane;
    }
    // The lane index is scaled by 4 before being handed to ds_bpermute.
    return __builtin_amdgcn_ds_bpermute(source << 2, var);
}

// short overload: widens to int, shuffles, and narrows the result back.
__device__
inline
short __shfl_xor(short var, int lane_mask, int width = warpSize) {
    return static_cast<short>(__shfl_xor(static_cast<int>(var), lane_mask, width));
}
// unsigned short overload: routes the 16-bit pattern through the short overload.
__device__
inline
unsigned short __shfl_xor(unsigned short var, int lane_mask, int width = warpSize) {
    // Same 16 bits viewed as a signed value for the shuffle, then viewed back.
    const short as_signed = static_cast<short>(var);
    const short moved = __shfl_xor(as_signed, lane_mask, width);
    return static_cast<unsigned short>(moved);
}

// unsigned int overload: shuffles the same 32-bit pattern through the int overload.
__device__
inline
unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
    const int moved = __shfl_xor(static_cast<int>(var), lane_mask, width);
    return static_cast<unsigned int>(moved);
}

// float overload: shuffles the raw 32-bit representation through the int overload.
__device__
inline
float __shfl_xor(float var, int lane_mask, int width = warpSize) {
    static_assert(sizeof(float) == sizeof(int), "");

    // Copy the bit pattern out, move it between lanes, and copy it back in.
    int bits;
    __builtin_memcpy(&bits, &var, sizeof(bits));
    bits = __shfl_xor(bits, lane_mask, width);

    float result;
    __builtin_memcpy(&result, &bits, sizeof(bits));
    return result;
}

// double overload: shuffles the value as two 32-bit words through the int
// overload and reassembles the 64-bit pattern afterwards.
__device__
inline
double __shfl_xor(double var, int lane_mask, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int words[2];
    __builtin_memcpy(words, &var, sizeof(words));
    const int first = __shfl_xor(words[0], lane_mask, width);
    const int second = __shfl_xor(words[1], lane_mask, width);

    // `second` supplies bits 63..32 of the pattern, `first` bits 31..0.
    const uint64_t bits = (static_cast<uint64_t>(second) << 32ull) | static_cast<uint32_t>(first);
    double result;
    __builtin_memcpy(&result, &bits, sizeof(bits));
    return result;
}

// long overload: two 32-bit shuffles when long is 64-bit, a single int
// shuffle when _MSC_VER is defined (long asserted to be int-sized there).
__device__
inline
long __shfl_xor(long var, int lane_mask, int width = warpSize) {
#ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int words[2];
    __builtin_memcpy(words, &var, sizeof(words));
    const int first = __shfl_xor(words[0], lane_mask, width);
    const int second = __shfl_xor(words[1], lane_mask, width);

    // `second` supplies bits 63..32 of the pattern, `first` bits 31..0.
    const uint64_t bits = (static_cast<uint64_t>(second) << 32ull) | static_cast<uint32_t>(first);
    long result;
    __builtin_memcpy(&result, &bits, sizeof(bits));
    return result;
#else
    static_assert(sizeof(long) == sizeof(int), "");
    const int moved = __shfl_xor(static_cast<int>(var), lane_mask, width);
    return static_cast<long>(moved);
#endif
}
// unsigned long overload: two 32-bit shuffles when unsigned long is 64-bit,
// a single unsigned int shuffle when _MSC_VER is defined.
__device__
inline
unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize) {
#ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int words[2];
    __builtin_memcpy(words, &var, sizeof(words));
    const unsigned int first = __shfl_xor(words[0], lane_mask, width);
    const unsigned int second = __shfl_xor(words[1], lane_mask, width);

    // `second` supplies bits 63..32 of the pattern, `first` bits 31..0.
    const uint64_t bits = (static_cast<uint64_t>(second) << 32ull) | static_cast<uint32_t>(first);
    unsigned long result;
    __builtin_memcpy(&result, &bits, sizeof(bits));
    return result;
#else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    const unsigned int moved = __shfl_xor(static_cast<unsigned int>(var), lane_mask, width);
    return static_cast<unsigned long>(moved);
#endif
}
// long long overload: shuffles the value as two 32-bit words through the int
// overload and reassembles the 64-bit pattern afterwards.
__device__
inline
long long __shfl_xor(long long var, int lane_mask, int width = warpSize) {
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");

    int words[2];
    __builtin_memcpy(words, &var, sizeof(words));
    const int first = __shfl_xor(words[0], lane_mask, width);
    const int second = __shfl_xor(words[1], lane_mask, width);

    // `second` supplies bits 63..32 of the pattern, `first` bits 31..0.
    const uint64_t bits = (static_cast<uint64_t>(second) << 32ull) | static_cast<uint32_t>(first);
    long long result;
    __builtin_memcpy(&result, &bits, sizeof(bits));
    return result;
}
// unsigned long long overload: shuffles the value as two 32-bit words through
// the unsigned int overload and reassembles the 64-bit pattern afterwards.
__device__
inline
unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize) {
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

    unsigned int words[2];
    __builtin_memcpy(words, &var, sizeof(words));
    const unsigned int first = __shfl_xor(words[0], lane_mask, width);
    const unsigned int second = __shfl_xor(words[1], lane_mask, width);

    // `second` supplies bits 63..32 of the pattern, `first` bits 31..0.
    const uint64_t bits = (static_cast<uint64_t>(second) << 32ull) | static_cast<uint32_t>(first);
    unsigned long long result;
    __builtin_memcpy(&result, &bits, sizeof(bits));
    return result;
}

#if defined(__clang__)
#pragma clang diagnostic pop
#endif

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

#if CUDA_VERSION >= 9000
#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
// __shfl_sync_* variants available in CUDA-9

// CUDA 9 `_sync` spelling of __shfl. `mask` is accepted so existing call sites
// compile, but it is not used: the call forwards to __shfl unchanged.
template <typename T>
__device__
inline
T __shfl_sync(unsigned mask, T var, int srcLane, int width=warpSize) {
    (void)mask;
    T received = __shfl(var, srcLane, width);
    return received;
}

// CUDA 9 `_sync` spelling of __shfl_up. `mask` is accepted so existing call
// sites compile, but it is not used: the call forwards to __shfl_up unchanged.
template <typename T>
__device__
inline
T __shfl_up_sync(unsigned mask, T var, unsigned int delta, int width=warpSize) {
    (void)mask;
    T received = __shfl_up(var, delta, width);
    return received;
}

// CUDA 9 `_sync` spelling of __shfl_down. `mask` is accepted so existing call
// sites compile, but it is not used: the call forwards to __shfl_down unchanged.
template <typename T>
__device__
inline
T __shfl_down_sync(unsigned mask, T var, unsigned int delta, int width=warpSize) {
    (void)mask;
    T received = __shfl_down(var, delta, width);
    return received;
}

// CUDA 9 `_sync` spelling of __shfl_xor. `mask` is accepted so existing call
// sites compile, but it is not used: the call forwards to __shfl_xor unchanged.
template <typename T>
__device__
inline
T __shfl_xor_sync(unsigned mask, T var, int laneMask, int width=warpSize) {
    (void)mask;
    T received = __shfl_xor(var, laneMask, width);
    return received;
}

// Warp-level barrier with memory ordering among the lanes of one wavefront.
//
// The previous implementation called the block-wide __syncthreads(), which
// makes every wavefront of the block wait; code that calls __syncwarp() from
// only some warps (e.g. inside `if (threadIdx.x < warpSize)`) could then hang
// or pair up with an unrelated barrier. A wavefront-scope barrier is used
// instead.
//
// `mask` is accepted for CUDA source compatibility and is not used: the
// barrier always applies to the whole wavefront.
__device__
inline
__attribute__((convergent))
void __syncwarp(unsigned mask=0xffffffff)
{
    (void)mask;
    // Make earlier memory writes visible to the other lanes of the wavefront.
    __builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront");
    __builtin_amdgcn_wave_barrier();
    // Make the other lanes' writes visible to the accesses that follow.
    __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront");
}

// Forwards barrier `id` to __barrier; nothing is returned.
__device__
inline
void  __barrier_sync(unsigned id) {
  __barrier(id);
}

/*******************************************
*        Not supported on HIP yet          *
********************************************/
// inline __device__ void __barrier_sync_count(unsigned int id,
//                                             unsigned int count) {
//   __nvvm_barrier_sync_cnt(id, count);
// }

// CUDA 9 `_sync` spelling of __all. `mask` is accepted for source
// compatibility but is not used: the vote is forwarded to __all.
__device__
inline
int __all_sync(unsigned mask, int predicate) {
    (void)mask;
    const int vote = __all(predicate);
    return vote;
}

// CUDA 9 `_sync` spelling of __any. `mask` is accepted for source
// compatibility but is not used: the vote is forwarded to __any.
__device__
inline
int __any_sync(unsigned mask, int predicate) {
    (void)mask;
    const int vote = __any(predicate);
    return vote;
}

// inline __device__ int __uni_sync(unsigned int mask, int pred) {
//   return __nvvm_vote_uni_sync(mask, pred);
// }

// CUDA 9 `_sync` spelling of __ballot. `mask` is accepted for source
// compatibility but is not used. The __ballot result is converted to
// `unsigned` on return.
__device__
inline
unsigned __ballot_sync(unsigned mask, int predicate) {
    (void)mask;
    const unsigned votes = __ballot(predicate);
    return votes;
}

// Returns a bit mask of the lanes that are currently executing.
//
// The previous implementation returned the constant 0xffffffff even when only
// part of the warp was active, so callers that count or test bits of the mask
// got wrong answers in divergent code. Balloting a true predicate gives one
// set bit per lane that actually reaches this call.
//
// The result is converted to `unsigned`, the same width __ballot_sync returns
// in this header.
__device__
inline
unsigned __activemask()
{
    return __ballot(1);
}

// Finds the position of the n-th set bit of a 64-bit mask, counting from bit `base`.
//
//   offset > 0 : search upward from `base` (inclusive) and return the position
//                of the offset-th set bit.
//   offset == 0: test bit `base` only; return `base` if it is set.
//   offset < 0 : search downward from `base` (inclusive) and return the
//                position of the |offset|-th set bit.
//
// Returns -1 if no such bit exists. `base` must be in [0, 63]; larger values
// make the shifts below undefined.
__device__  static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) {
  uint64_t temp_mask = mask;
  int32_t temp_offset = offset;

  if (offset == 0) {
    // Keep only bit `base`. The literal must be 64-bit: a plain `1 << base`
    // is an int shift and is undefined once base reaches the int width.
    temp_mask &= (1ULL << base);
    temp_offset = 1;
  }
  else if (offset < 0) {
    // A downward search is an upward search on the bit-reversed mask.
    temp_mask = __builtin_bitreverse64(mask);
    base = 63 - base;
    temp_offset = -offset;
  }

  // Discard every bit below the starting position.
  temp_mask = temp_mask & ((~0ULL) << base);
  if (__builtin_popcountll(temp_mask) < temp_offset)
    return -1;

  // Binary search: at each step look at the low `i` bits; if they hold fewer
  // than the remaining number of set bits, skip past them.
  int32_t total = 0;
  for (int i = 0x20; i > 0; i >>= 1) {
    uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
    int32_t pcnt = __builtin_popcountll(temp_mask_lo);
    if (pcnt < temp_offset) {
      temp_mask = temp_mask >> i;
      temp_offset -= pcnt;
      total += i;
    }
    else {
      temp_mask = temp_mask_lo;
    }
  }

  // Map the position back when the search ran on the reversed mask.
  if (offset < 0)
    return 63 - total;
  else
    return total;
}

// Finds the position of the n-th set bit of `mask`, counting from bit `base`.
// Despite the name, the mask is taken as a 64-bit value and searched over all
// 64 bit positions.
//
//   offset > 0 : search upward from `base` (inclusive) and return the position
//                of the offset-th set bit.
//   offset == 0: test bit `base` only; return `base` if it is set.
//   offset < 0 : search downward from `base` (inclusive) and return the
//                position of the |offset|-th set bit.
//
// Returns -1 if no such bit exists. `base` must be in [0, 63]; larger values
// make the shifts below undefined.
__device__ static int32_t __fns32(uint64_t mask, uint32_t base, int32_t offset) {
  uint64_t temp_mask = mask;
  int32_t temp_offset = offset;
  if (offset == 0) {
    // Keep only bit `base`. The literal must be 64-bit: a plain `1 << base`
    // is an int shift and is undefined once base reaches the int width.
    temp_mask &= (1ULL << base);
    temp_offset = 1;
  }
  else if (offset < 0) {
    // A downward search is an upward search on the bit-reversed mask.
    temp_mask = __builtin_bitreverse64(mask);
    base = 63 - base;
    temp_offset = -offset;
  }
  // Discard every bit below the starting position.
  temp_mask = temp_mask & ((~0ULL) << base);
  if (__builtin_popcountll(temp_mask) < temp_offset)
    return -1;
  // Binary search: at each step look at the low `i` bits; if they hold fewer
  // than the remaining number of set bits, skip past them.
  int32_t total = 0;
  for (int i = 0x20; i > 0; i >>= 1) {
    uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
    int32_t pcnt = __builtin_popcountll(temp_mask_lo);
    if (pcnt < temp_offset) {
      temp_mask = temp_mask >> i;
      temp_offset -= pcnt;
      total += i;
    }
    else {
      temp_mask = temp_mask_lo;
    }
  }
  // Map the position back when the search ran on the reversed mask.
  if (offset < 0)
    return 63 - total;
  else
    return total;
}

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

// Define __match* builtins CUDA-9 headers expect to see.
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
// inline __device__ unsigned int __match32_any_sync(unsigned int mask,
//                                                   unsigned int value) {
//   return __nvvm_match_any_sync_i32(mask, value);
// }

// inline __device__ unsigned int
// __match64_any_sync(unsigned int mask, unsigned long long value) {
//   return __nvvm_match_any_sync_i64(mask, value);
// }

// inline __device__ unsigned int
// __match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
//   return __nvvm_match_all_sync_i32p(mask, value, pred);
// }

// inline __device__ unsigned int
// __match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
//   return __nvvm_match_all_sync_i64p(mask, value, pred);
// }
// #include "crt/sm_70_rt.hpp"

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
#endif // CUDA_VERSION >= 9000

// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.

// Prevent the vanilla sm_32 intrinsics header from being included.
#define __SM_32_INTRINSICS_H__
#define __SM_32_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

// Stand-in for CUDA's read-only-cache load: returns the char at `ptr` via an ordinary load.
__device__ static inline char __ldg(const char* ptr) { return ptr[0]; }

// Stand-in for CUDA's read-only-cache load: returns the char2 at `ptr` via an ordinary load.
__device__ static inline char2 __ldg(const char2* ptr) { return ptr[0]; }

// Stand-in for CUDA's read-only-cache load: returns the char4 at `ptr` via an ordinary load.
__device__ static inline char4 __ldg(const char4* ptr) { return ptr[0]; }

// Stand-in for CUDA's read-only-cache load: returns the signed char at `ptr` via an ordinary load.
__device__ static inline signed char __ldg(const signed char* ptr) { return *ptr; }

// Stand-in for CUDA's read-only-cache load: returns the unsigned char at `ptr` via an ordinary load.
__device__ static inline unsigned char __ldg(const unsigned char* ptr) { return *ptr; }


// Stand-in for CUDA's read-only-cache load: returns the short at `ptr` via an ordinary load.
__device__ static inline short __ldg(const short* ptr) { return *ptr; }

// Stand-in for CUDA's read-only-cache load: returns the short2 at `ptr` via an ordinary load.
__device__ static inline short2 __ldg(const short2* ptr) { return *ptr; }

// Stand-in for CUDA's read-only-cache load: returns the short4 at `ptr` via an ordinary load.
__device__ static inline short4 __ldg(const short4* ptr) { return *ptr; }

// Stand-in for CUDA's read-only-cache load: returns the unsigned short at `ptr` via an ordinary load.
__device__ static inline unsigned short __ldg(const unsigned short* ptr) { return *ptr; }


// Stand-in for CUDA's read-only-cache load: returns the int at `ptr` via an ordinary load.
__device__ static inline int __ldg(const int* ptr) { return *ptr; }

// Stand-in for CUDA's read-only-cache load: returns the int2 at `ptr` via an ordinary load.
__device__ static inline int2 __ldg(const int2* ptr) { return *ptr; }

// Stand-in for CUDA's read-only-cache load: returns the int4 at `ptr` via an ordinary load.
__device__ static inline int4 __ldg(const int4* ptr) { return *ptr; }

// Stand-in for CUDA's read-only-cache load: returns the unsigned int at `ptr` via an ordinary load.
__device__ static inline unsigned int __ldg(const unsigned int* ptr) { return *ptr; }


// Stand-in for CUDA's read-only-cache load: returns the long at `ptr` via an ordinary load.
__device__ static inline long __ldg(const long* ptr) { return *ptr; }

// Stand-in for CUDA's read-only-cache load: returns the unsigned long at `ptr` via an ordinary load.
__device__ static inline unsigned long __ldg(const unsigned long* ptr) { return *ptr; }


// Stand-in for CUDA's read-only-cache load: returns the long long at `ptr` via an ordinary load.
__device__ static inline long long __ldg(const long long* ptr) { return *ptr; }

// Stand-in for CUDA's read-only-cache load: returns the longlong2 at `ptr` via an ordinary load.
__device__ static inline longlong2 __ldg(const longlong2* ptr) { return *ptr; }

// Stand-in for CUDA's read-only-cache load: returns the unsigned long long at `ptr` via an ordinary load.
__device__ static inline unsigned long long __ldg(const unsigned long long* ptr) { return *ptr; }


// Stand-in for CUDA's read-only-cache load: returns the uchar2 at `ptr` via an ordinary load.
__device__ static inline uchar2 __ldg(const uchar2* ptr) { return *ptr; }

// Stand-in for CUDA's read-only-cache load: returns the uchar4 at `ptr` via an ordinary load.
__device__ static inline uchar4 __ldg(const uchar4* ptr) { return *ptr; }


// Stand-in for CUDA's read-only-cache load: returns the ushort2 at `ptr` via an ordinary load.
__device__ static inline ushort2 __ldg(const ushort2* ptr) { return *ptr; }


// Stand-in for CUDA's read-only-cache load: returns the uint2 at `ptr` via an ordinary load.
__device__ static inline uint2 __ldg(const uint2* ptr) { return *ptr; }

// Stand-in for CUDA's read-only-cache load: returns the uint4 at `ptr` via an ordinary load.
__device__ static inline uint4 __ldg(const uint4* ptr) { return *ptr; }


// Stand-in for CUDA's read-only-cache load: returns the ulonglong2 at `ptr` via an ordinary load.
__device__ static inline ulonglong2 __ldg(const ulonglong2* ptr) { return *ptr; }


// Stand-in for CUDA's read-only-cache load: returns the float at `ptr` via an ordinary load.
__device__ static inline float __ldg(const float* ptr) { return *ptr; }

// Stand-in for CUDA's read-only-cache load: returns the float2 at `ptr` via an ordinary load.
__device__ static inline float2 __ldg(const float2* ptr) { return *ptr; }

// Stand-in for CUDA's read-only-cache load: returns the float4 at `ptr` via an ordinary load.
__device__ static inline float4 __ldg(const float4* ptr) { return *ptr; }


// Stand-in for CUDA's read-only-cache load: returns the double at `ptr` via an ordinary load.
__device__ static inline double __ldg(const double* ptr) { return *ptr; }

// Stand-in for CUDA's read-only-cache load: returns the double2 at `ptr` via an ordinary load.
__device__ static inline double2 __ldg(const double2* ptr) { return *ptr; }

// TODO: Implement these as intrinsics, so the backend can work its magic on
// these.  Alternatively, we could implement these as plain C and try to get
// llvm to recognize the relevant patterns.
// Left funnel shift with a wrapped shift amount: only the low five bits of
// `shift` are used. A wrapped amount of zero returns `hi` directly; otherwise
// the result comes from alignbit(hi, lo, 32 - amount).
__device__ inline unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)
{
    const uint32_t wrapped = shift & 31;
    if (wrapped == 0) {
        // Handled separately: the complement below would be 32 for this case.
        return hi;
    }
    return __builtin_amdgcn_alignbit(hi, lo, 32 - wrapped);
}

// Left funnel shift with a clamped shift amount: `shift` values of 32 or more
// are treated as 32. An amount of zero returns `hi` directly; otherwise the
// result comes from alignbit(hi, lo, 32 - amount).
__device__ inline unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)
{
    uint32_t clamped = shift;
    if (shift >= 32) {
        clamped = 32;
    }
    if (clamped == 0) {
        return hi;
    }
    return __builtin_amdgcn_alignbit(hi, lo, 32 - clamped);
}

// Right funnel shift: passes `hi`, `lo` and `shift` straight to alignbit and
// returns its result.
__device__ inline unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)
{
    const unsigned int shifted = __builtin_amdgcn_alignbit(hi, lo, shift);
    return shifted;
}

// Right funnel shift with a clamped shift amount: `shift` values of 32 or more
// return `hi`; smaller values are passed to alignbit(hi, lo, shift).
__device__ inline unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)
{
    if (shift >= 32) {
        return hi;
    }
    return __builtin_amdgcn_alignbit(hi, lo, shift);
}

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

#if CUDA_VERSION >= 11000
// Address-space conversion helpers with C linkage.
//
// Each generic_to_<space> helper casts a generic pointer to a pointer in a
// numbered address space and returns it as an integer; each <space>_to_generic
// helper performs the reverse cast. The address-space numbers used below are
// 1 (global), 3 (shared), 4 (constant) and 5 (local), paired as the function
// names indicate.
extern "C" {
// Generic pointer -> address space 1 (global), returned as size_t.
__device__ inline size_t __nv_cvta_generic_to_global_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(1))) *)__ptr;
}
// Generic pointer -> address space 3 (shared), returned as size_t.
__device__ inline size_t __nv_cvta_generic_to_shared_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(3))) *)__ptr;
}
// Generic pointer -> address space 4 (constant), returned as size_t.
__device__ inline size_t __nv_cvta_generic_to_constant_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(4))) *)__ptr;
}
// Generic pointer -> address space 5 (local), returned as size_t.
__device__ inline size_t __nv_cvta_generic_to_local_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(5))) *)__ptr;
}
// Integer address in address space 1 (global) -> generic pointer.
__device__ inline void *__nv_cvta_global_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(1))) *)__ptr;
}
// Integer address in address space 3 (shared) -> generic pointer.
__device__ inline void *__nv_cvta_shared_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(3))) *)__ptr;
}
// Integer address in address space 4 (constant) -> generic pointer.
__device__ inline void *__nv_cvta_constant_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(4))) *)__ptr;
}
// Integer address in address space 5 (local) -> generic pointer.
__device__ inline void *__nv_cvta_local_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(5))) *)__ptr;
}
// Shared-memory address of `__ptr` as a 32-bit integer. The size_t result of
// the generic-to-shared conversion is implicitly narrowed to uint32_t here.
// NOTE(review): presumably shared addresses always fit in 32 bits — confirm.
__device__ inline uint32_t __nvvm_get_smem_pointer(void *__ptr) {
  return __nv_cvta_generic_to_shared_impl(__ptr);
}
} // extern "C"
#endif // CUDA_VERSION >= 11000

#endif // defined(__CLANG_CUDAMOCKER_INTRINSICS_H__)
