/*===---- __clang_cuda_device_functions.h - CUDA runtime support -----------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __CLANG__CUDAMOCKER_DEVICE_FUNCTIONS_H__
#define __CLANG__CUDAMOCKER_DEVICE_FUNCTIONS_H__

#ifndef __OPENMP_NVPTX__
#if CUDA_VERSION < 9000
#error This file is intended to be used with CUDA-9+ only.
#endif
#endif

// __DEVICE__ is a helper macro with common set of attributes for the wrappers
// we implement in this file. We need static in order to avoid emitting unused
// functions and __forceinline__ helps inlining these wrappers at -O1.
#pragma push_macro("__DEVICE__")
#ifdef __OPENMP_NVPTX__
#define __DEVICE__ static __attribute__((always_inline, nothrow))
#else
#define __DEVICE__ static __device__ __forceinline__
#endif

// ICMP_NE predicate code, copied from llvm/include/llvm/IR/InstrTypes.h
#define ICMP_NE 33

// Vote helpers: each one hands `predicate` to the matching OCKL `wfall` /
// `wfany` routine and returns whatever that routine reports.
__DEVICE__ int __all(int predicate)
{
    const int vote = __ockl_wfall_i32(predicate);
    return vote;
}
__DEVICE__ int __any(int predicate)
{
    const int vote = __ockl_wfany_i32(predicate);
    return vote;
}
/***********************************************
*  APIs with differences between HIP and CUDA  *
***********************************************/
// __DEVICE__ unsigned int __ballot(int __a) { return __nvvm_vote_ballot(__a); }
// Compares `predicate` against zero with the ICMP_NE condition code through
// the AMDGCN unsigned-compare builtin and returns the builtin's result.
// NOTE(review): the return type is 64-bit here, whereas the commented-out CUDA
// declaration returns `unsigned int` — callers expecting 32 bits should be
// checked.
__DEVICE__ unsigned long long int __ballot(int predicate)
{
    const unsigned long long int mask =
        __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
    return mask;
}
// Bit-reversal of a 32-bit / 64-bit value via the compiler builtins.
__DEVICE__ unsigned int __brev(unsigned int input)
{
    const unsigned int reversed = __builtin_bitreverse32(input);
    return reversed;
}
__DEVICE__ unsigned long long __brevll(unsigned long long int input)
{
    const unsigned long long reversed = __builtin_bitreverse64(input);
    return reversed;
}

/*******************************************
*         Not yet supported by HIP         *
*******************************************/
// #if defined(__cplusplus)
// __DEVICE__ void __brkpt() { __asm__ __volatile__("brkpt;"); }
// __DEVICE__ void __brkpt(int __a) { __brkpt(); }
// #else
// __DEVICE__ void __attribute__((overloadable)) __brkpt(void) {
//   __asm__ __volatile__("brkpt;");
// }
// __DEVICE__ void __attribute__((overloadable)) __brkpt(int __a) { __brkpt(); }
// #endif

// 4-byte, 4-byte-aligned scratch value that can be accessed either as one
// 32-bit word (`ui`) or as its four individual bytes (`c`).
struct ucharHolder {
    union {
        unsigned char c[4];
        unsigned int ui;
    };
} __attribute__((aligned(4)));

// 8-byte, 8-byte-aligned scratch value that can be accessed either as two
// 32-bit words (`ui`) or as its eight individual bytes (`c`).
struct uchar2Holder {
    union {
        unsigned int ui[2];
        unsigned char c[8];
    };
} __attribute__((aligned(8)));

// Byte permute: assembles a 32-bit result from bytes picked out of the 8-byte
// pool {x, y}. Pool bytes 0-3 are the bytes of `x` (least significant first)
// and pool bytes 4-7 are the bytes of `y`.
//
// Result byte i (i = 0..3) is chosen by the 3-bit field s[4*i+2 : 4*i]; every
// other bit of `s` is ignored.
//
// The earlier version read the bytes through a union (type punning) and
// shifted an int-promoted `unsigned char` left by 24, i.e. into the sign bit.
// Here everything is done with unsigned shifts, which selects the same bytes
// on a little-endian target without either of those constructs.
__DEVICE__ unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
    const unsigned long long pool =
        (static_cast<unsigned long long>(y) << 32) | x;
    unsigned int result = 0;
    for (unsigned int i = 0; i < 4; ++i) {
        // 3-bit selector for result byte i.
        const unsigned int sel = (s >> (4u * i)) & 0x7u;
        const unsigned int byte =
            static_cast<unsigned int>(pool >> (8u * sel)) & 0xFFu;
        result |= byte << (8u * i);
    }
    return result;
}
// Count-leading-zeros wrappers: the signed argument is cast to `uint` /
// `uint64_t` and passed to the OCKL `clz` routine of the matching width.
__DEVICE__ int __clz(int input)
{
    const uint bits = (uint) input;
    return __ockl_clz_u32(bits);
}
__DEVICE__ int __clzll(long long int input)
{
    const uint64_t bits = (uint64_t)input;
    return __ockl_clz_u64(bits);
}
// Forwards to OCML's `native_cos` for float.
__DEVICE__ float __cosf(float x)
{
    return __ocml_native_cos_f32(x);
}

// Double-precision add / divide / multiply.
// The `_rn` variants use the plain C++ operator; `_rd`, `_ru` and `_rz`
// forward to the OCML `rtn`, `rtp` and `rtz` routines respectively
// (presumably round-down / round-up / round-toward-zero — confirm in OCML docs).
__DEVICE__ double __dadd_rd(double x, double y) {
    return __ocml_add_rtn_f64(x, y);
}
__DEVICE__ double __dadd_rn(double x, double y) {
    return x + y;
}
__DEVICE__ double __dadd_ru(double x, double y) {
    return __ocml_add_rtp_f64(x, y);
}
__DEVICE__ double __dadd_rz(double x, double y) {
    return __ocml_add_rtz_f64(x, y);
}
__DEVICE__ double __ddiv_rd(double x, double y) {
    return __ocml_div_rtn_f64(x, y);
}
__DEVICE__ double __ddiv_rn(double x, double y) {
    return x / y;
}
__DEVICE__ double __ddiv_ru(double x, double y) {
    return __ocml_div_rtp_f64(x, y);
}
__DEVICE__ double __ddiv_rz(double x, double y) {
    return __ocml_div_rtz_f64(x, y);
}
__DEVICE__ double __dmul_rd(double x, double y) {
    return __ocml_mul_rtn_f64(x, y);
}
__DEVICE__ double __dmul_rn(double x, double y) {
    return x * y;
}
__DEVICE__ double __dmul_ru(double x, double y) {
    return __ocml_mul_rtp_f64(x, y);
}
__DEVICE__ double __dmul_rz(double x, double y) {
    return __ocml_mul_rtz_f64(x, y);
}
// Double -> float conversions. `_rn` uses the ordinary C++ narrowing
// conversion; the other variants forward to OCML's `cvtrtn` / `cvtrtp` /
// `cvtrtz` routines.
__DEVICE__ float __double2float_rd(double x) { return __ocml_cvtrtn_f32_f64(x); }
__DEVICE__ float __double2float_rn(double x) {
    return static_cast<float>(x);
}
__DEVICE__ float __double2float_ru(double x) { return __ocml_cvtrtp_f32_f64(x); }
__DEVICE__ float __double2float_rz(double x) { return __ocml_cvtrtz_f32_f64(x); }
// The original CUDA header has no __double2half; it is added here using the
// HIP implementation.
__DEVICE__ _Float16 __double2half(const double a) { return __ocml_cvtrte_f16_f64(a); }
// Copies the object representation of `x` into two ints and returns the
// second one (index 1), i.e. the high 32 bits on a little-endian target.
__DEVICE__ int __double2hiint(double x)
{
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int words[2];
    __builtin_memcpy(words, &x, sizeof(words));
    const int high = words[1];
    return high;
}
// Double -> int conversions: the value is first rounded to an integral double
// (floor / rint / ceil, or nothing for `_rz`) and then cast to int.
__DEVICE__ int __double2int_rd(double x) {
    return static_cast<int>(__ocml_floor_f64(x));
}
__DEVICE__ int __double2int_rn(double x) {
    return static_cast<int>(__ocml_rint_f64(x));
}
__DEVICE__ int __double2int_ru(double x) {
    return static_cast<int>(__ocml_ceil_f64(x));
}
__DEVICE__ int __double2int_rz(double x) {
    return static_cast<int>(x);
}

// Double -> long long conversions: round with OCML floor / rint / ceil (or not
// at all for `_rz`), then cast.
__DEVICE__ long long int __double2ll_rd(double x) { return static_cast<long long>(__ocml_floor_f64(x)); }
__DEVICE__ long long int __double2ll_rn(double x) { return static_cast<long long>(__ocml_rint_f64(x)); }
__DEVICE__ long long int __double2ll_ru(double x) { return static_cast<long long>(__ocml_ceil_f64(x)); }
__DEVICE__ long long int __double2ll_rz(double x) {
  return static_cast<long long>(x);
}

// Copies the object representation of `x` into two ints and returns the
// first one (index 0), i.e. the low 32 bits on a little-endian target.
__DEVICE__ int __double2loint(double x)
{
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int words[2];
    __builtin_memcpy(words, &x, sizeof(words));
    const int low = words[0];
    return low;
}
// Double -> unsigned int conversions: round with OCML floor / rint / ceil (or
// not at all for `_rz`), then cast.
__DEVICE__ unsigned int __double2uint_rd(double x) { return static_cast<unsigned int>(__ocml_floor_f64(x)); }
__DEVICE__ unsigned int __double2uint_rn(double x) { return static_cast<unsigned int>(__ocml_rint_f64(x)); }
__DEVICE__ unsigned int __double2uint_ru(double x) { return static_cast<unsigned int>(__ocml_ceil_f64(x)); }
__DEVICE__ unsigned int __double2uint_rz(double x) {
  return static_cast<unsigned int>(x);
}

// Double -> unsigned long long conversions: round with OCML floor / rint /
// ceil (or not at all for `_rz`), then cast.
__DEVICE__ unsigned long long int __double2ull_rd(double x) { return static_cast<unsigned long long int>(__ocml_floor_f64(x)); }
__DEVICE__ unsigned long long int __double2ull_rn(double x) { return static_cast<unsigned long long int>(__ocml_rint_f64(x)); }
__DEVICE__ unsigned long long int __double2ull_ru(double x) { return static_cast<unsigned long long int>(__ocml_ceil_f64(x)); }
__DEVICE__ unsigned long long int __double2ull_rz(double x) { return static_cast<unsigned long long int>(x); }

#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
#endif
// Reinterprets the bits of a double as a long long (bitwise copy, no numeric
// conversion).
__device__ static inline long long int __double_as_longlong(double x)
{
    static_assert(sizeof(long long) == sizeof(double), "");

    long long bits;
    __builtin_memcpy(&bits, &x, sizeof(bits));
    return bits;
}
#if defined(__clang__)
#pragma clang diagnostic pop
#endif

// Double-precision reciprocal.
// NOTE(review): all four rounding-mode variants call the same
// `__builtin_amdgcn_rcp` builtin, so the suffix makes no difference here.
__DEVICE__ double __drcp_rd(double x) {
    return __builtin_amdgcn_rcp(x);
}
__DEVICE__ double __drcp_rn(double x) {
    return __builtin_amdgcn_rcp(x);
}
__DEVICE__ double __drcp_ru(double x) {
    return __builtin_amdgcn_rcp(x);
}
__DEVICE__ double __drcp_rz(double x) {
    return __builtin_amdgcn_rcp(x);
}
// Double-precision square root; `_rn` uses the default OCML sqrt, the others
// the `rtn` / `rtp` / `rtz` variants.
__DEVICE__ double __dsqrt_rd(double x) {
    return __ocml_sqrt_rtn_f64(x);
}
__DEVICE__ double __dsqrt_rn(double x) {
    return __ocml_sqrt_f64(x);
}
__DEVICE__ double __dsqrt_ru(double x) {
    return __ocml_sqrt_rtp_f64(x);
}
__DEVICE__ double __dsqrt_rz(double x) {
    return __ocml_sqrt_rtz_f64(x);
}
// Double-precision subtraction; `_rn` is the plain C++ operator, the others
// forward to OCML's `rtn` / `rtp` / `rtz` routines.
__DEVICE__ double __dsub_rd(double x, double y) {
    return __ocml_sub_rtn_f64(x, y);
}
__DEVICE__ double __dsub_rn(double x, double y) {
    return x - y;
}
__DEVICE__ double __dsub_ru(double x, double y) {
    return __ocml_sub_rtp_f64(x, y);
}
__DEVICE__ double __dsub_rz(double x, double y) {
    return __ocml_sub_rtz_f64(x, y);
}
// Fast float exponentials: forward to OCML's `native_exp10` / `native_exp`.
__DEVICE__ float __exp10f(float x) {
    return __ocml_native_exp10_f32(x);
}
__DEVICE__ float __expf(float x) {
    return __ocml_native_exp_f32(x);
}

// Single-precision add / divide.
// `_rn` variants and __fdividef use the plain C++ operator; `_rd`, `_ru` and
// `_rz` forward to the OCML `rtn`, `rtp` and `rtz` routines.
__DEVICE__ float __fadd_rd(float x, float y) {
    return __ocml_add_rtn_f32(x, y);
}
__DEVICE__ float __fadd_rn(float x, float y) {
    return x + y;
}
__DEVICE__ float __fadd_ru(float x, float y) {
    return __ocml_add_rtp_f32(x, y);
}
__DEVICE__ float __fadd_rz(float x, float y) {
    return __ocml_add_rtz_f32(x, y);
}
__DEVICE__ float __fdiv_rd(float x, float y) {
    return __ocml_div_rtn_f32(x, y);
}
__DEVICE__ float __fdiv_rn(float x, float y) {
    return x / y;
}
__DEVICE__ float __fdiv_ru(float x, float y) {
    return __ocml_div_rtp_f32(x, y);
}
__DEVICE__ float __fdiv_rz(float x, float y) {
    return __ocml_div_rtz_f32(x, y);
}
__DEVICE__ float __fdividef(float x, float y) {
    return x / y;
}

/***********************************************
*  APIs with differences between HIP and CUDA  *
***********************************************/
// __DEVICE__ int __ffs(int __a) { return __nv_ffs(__a); }
// __DEVICE__ int __ffsll(long long __a) { return __nv_ffsll(__a); }
// Find-first-set family: each function returns the 1-based position of the
// least significant set bit of `input`, or 0 when `input` is zero.
//
// The zero case is handled explicitly because __builtin_ctz/__builtin_ctzll
// have an undefined result for a zero argument.
__DEVICE__ unsigned int __ffs(unsigned int input) {
    if (input == 0) {
        return 0;
    }
    return static_cast<unsigned int>(__builtin_ctz(input)) + 1u;
}
// 64-bit overload of __ffs. It must use the 64-bit builtin: the 32-bit
// __builtin_ctz would truncate `input`, producing an undefined result whenever
// only bits 32..63 are set.
__DEVICE__ unsigned int __ffs(unsigned long long int input) {
    if (input == 0) {
        return 0;
    }
    return static_cast<unsigned int>(__builtin_ctzll(input)) + 1u;
}
__DEVICE__ unsigned int __ffsll(unsigned long long int input) {
    if (input == 0) {
        return 0;
    }
    return static_cast<unsigned int>(__builtin_ctzll(input)) + 1u;
}
// Signed overloads: the bit pattern is examined as unsigned.
__DEVICE__ unsigned int __ffs(int input) {
    if (input == 0) {
        return 0;
    }
    return static_cast<unsigned int>(
               __builtin_ctz(static_cast<unsigned int>(input))) + 1u;
}
__DEVICE__ unsigned int __ffsll(long long int input) {
    if (input == 0) {
        return 0;
    }
    return static_cast<unsigned int>(
               __builtin_ctzll(static_cast<unsigned long long>(input))) + 1u;
}
// Finite-ness checks; forward to OCML's `isfinite` for double / float.
__DEVICE__ int __finite(double x) {
    return __ocml_isfinite_f64(x);
}
__DEVICE__ int __finitef(float x) {
    return __ocml_isfinite_f32(x);
}
// Declaration only (no body in this header), and only under MSVC.
#ifdef _MSC_VER
__DEVICE__ int __finitel(long double __a);
#endif
// Float -> int conversions: round with OCML floor / rint / ceil / trunc, then
// cast to int.
__DEVICE__ int __float2int_rd(float x) {
    return static_cast<int>(__ocml_floor_f32(x));
}
__DEVICE__ int __float2int_rn(float x) {
    return static_cast<int>(__ocml_rint_f32(x));
}
__DEVICE__ int __float2int_ru(float x) {
    return static_cast<int>(__ocml_ceil_f32(x));
}
__DEVICE__ int __float2int_rz(float x) {
    return static_cast<int>(__ocml_trunc_f32(x));
}
// Float -> long long conversions: round with OCML floor / rint / ceil (or not
// at all for `_rz`), then cast.
__DEVICE__ long long int __float2ll_rd(float x) { return static_cast<long long int>(__ocml_floor_f32(x)); }
__DEVICE__ long long int __float2ll_rn(float x) { return static_cast<long long int>(__ocml_rint_f32(x)); }
__DEVICE__ long long int __float2ll_ru(float x) { return static_cast<long long int>(__ocml_ceil_f32(x)); }
__DEVICE__ long long int __float2ll_rz(float x) {
  return static_cast<long long int>(x);
}
// Float -> unsigned int conversions: round with OCML floor / rint / ceil (or
// not at all for `_rz`), then cast.
__DEVICE__ unsigned int __float2uint_rd(float x) { return static_cast<unsigned int>(__ocml_floor_f32(x)); }
__DEVICE__ unsigned int __float2uint_rn(float x) { return static_cast<unsigned int>(__ocml_rint_f32(x)); }
__DEVICE__ unsigned int __float2uint_ru(float x) { return static_cast<unsigned int>(__ocml_ceil_f32(x)); }
__DEVICE__ unsigned int __float2uint_rz(float x) {
  return static_cast<unsigned int>(x);
}
// Float -> unsigned long long conversions: round with OCML floor / rint /
// ceil (or not at all for `_rz`), then cast.
__DEVICE__ unsigned long long int __float2ull_rd(float x) { return static_cast<unsigned long long int>(__ocml_floor_f32(x)); }
__DEVICE__ unsigned long long int __float2ull_rn(float x) { return static_cast<unsigned long long int>(__ocml_rint_f32(x)); }
__DEVICE__ unsigned long long int __float2ull_ru(float x) { return static_cast<unsigned long long int>(__ocml_ceil_f32(x)); }
__DEVICE__ unsigned long long int __float2ull_rz(float x) { return static_cast<unsigned long long int>(x); }
// Reinterprets the bits of a float as a signed int (bitwise copy, no numeric
// conversion).
__DEVICE__ int __float_as_int(float x)
{
    static_assert(sizeof(int) == sizeof(float), "");

    int bits;
    __builtin_memcpy(&bits, &x, sizeof(bits));
    return bits;
}
// Reinterprets the bits of a float as an unsigned int (bitwise copy).
__DEVICE__ unsigned int __float_as_uint(float x)
{
    static_assert(sizeof(unsigned int) == sizeof(float), "");

    unsigned int bits;
    __builtin_memcpy(&bits, &x, sizeof(bits));
    return bits;
}
// Double-precision fused multiply-add; `_rn` uses the default OCML fma, the
// others the `rtn` / `rtp` / `rtz` variants.
__DEVICE__ double __fma_rd(double x, double y, double z) {
    return __ocml_fma_rtn_f64(x, y, z);
}
__DEVICE__ double __fma_rn(double x, double y, double z) {
    return __ocml_fma_f64(x, y, z);
}
__DEVICE__ double __fma_ru(double x, double y, double z) {
    return __ocml_fma_rtp_f64(x, y, z);
}
__DEVICE__ double __fma_rz(double x, double y, double z) {
    return __ocml_fma_rtz_f64(x, y, z);
}
/*******************************************
*         Not yet supported by HIP         *
********************************************/
// __DEVICE__ float __fmaf_ieee_rd(float __a, float __b, float __c) {
//   return __nv_fmaf_ieee_rd(__a, __b, __c);
// }
// __DEVICE__ float __fmaf_ieee_rn(float __a, float __b, float __c) {
//   return __nv_fmaf_ieee_rn(__a, __b, __c);
// }
// __DEVICE__ float __fmaf_ieee_ru(float __a, float __b, float __c) {
//   return __nv_fmaf_ieee_ru(__a, __b, __c);
// }
// __DEVICE__ float __fmaf_ieee_rz(float __a, float __b, float __c) {
//   return __nv_fmaf_ieee_rz(__a, __b, __c);
// }
// Single-precision fused multiply-add; `_rn` uses the default OCML fma, the
// others the `rtn` / `rtp` / `rtz` variants.
__DEVICE__ float __fmaf_rd(float x, float y, float z) {
    return __ocml_fma_rtn_f32(x, y, z);
}
__DEVICE__ float __fmaf_rn(float x, float y, float z) {
    return __ocml_fma_f32(x, y, z);
}
__DEVICE__ float __fmaf_ru(float x, float y, float z) {
    return __ocml_fma_rtp_f32(x, y, z);
}
__DEVICE__ float __fmaf_rz(float x, float y, float z) {
    return __ocml_fma_rtz_f32(x, y, z);
}
// Single-precision multiply; `_rn` is the plain C++ operator, the others
// forward to OCML's `rtn` / `rtp` / `rtz` routines.
__DEVICE__ float __fmul_rd(float x, float y) {
    return __ocml_mul_rtn_f32(x, y);
}
__DEVICE__ float __fmul_rn(float x, float y) {
    return x * y;
}
__DEVICE__ float __fmul_ru(float x, float y) {
    return __ocml_mul_rtp_f32(x, y);
}
__DEVICE__ float __fmul_rz(float x, float y) {
    return __ocml_mul_rtz_f32(x, y);
}
// Single-precision reciprocal and reciprocal square root.
// NOTE(review): all four __frcp_* variants call the same
// `__builtin_amdgcn_rcpf` builtin, so the rounding suffix has no effect here.
__DEVICE__ float __frcp_rd(float x) {
    return __builtin_amdgcn_rcpf(x);
}
__DEVICE__ float __frcp_rn(float x) {
    return __builtin_amdgcn_rcpf(x);
}
__DEVICE__ float __frcp_ru(float x) {
    return __builtin_amdgcn_rcpf(x);
}
__DEVICE__ float __frcp_rz(float x) {
    return __builtin_amdgcn_rcpf(x);
}
__DEVICE__ float __frsqrt_rn(float x) {
    return __builtin_amdgcn_rsqf(x);
}
// Single-precision square root.
// NOTE(review): `_rn` goes through OCML's `native_sqrt`, while the double
// counterpart __dsqrt_rn uses the non-native `__ocml_sqrt_f64` — confirm the
// precision difference is intended.
__DEVICE__ float __fsqrt_rd(float x) {
    return __ocml_sqrt_rtn_f32(x);
}
__DEVICE__ float __fsqrt_rn(float x) {
    return __ocml_native_sqrt_f32(x);
}
__DEVICE__ float __fsqrt_ru(float x) {
    return __ocml_sqrt_rtp_f32(x);
}
__DEVICE__ float __fsqrt_rz(float x) {
    return __ocml_sqrt_rtz_f32(x);
}
// Single-precision subtraction; `_rn` is the plain C++ operator, the others
// forward to OCML's `rtn` / `rtp` / `rtz` routines.
__DEVICE__ float __fsub_rd(float x, float y) {
    return __ocml_sub_rtn_f32(x, y);
}
__DEVICE__ float __fsub_rn(float x, float y) {
    return x - y;
}
__DEVICE__ float __fsub_ru(float x, float y) {
    return __ocml_sub_rtp_f32(x, y);
}
__DEVICE__ float __fsub_rz(float x, float y) {
    return __ocml_sub_rtz_f32(x, y);
}

/***********************************************
*  APIs with differences between HIP and CUDA  *
***********************************************/
// __DEVICE__ int __hadd(int __a, int __b) { return __nv_hadd(__a, __b); }
// Signed halving add: returns floor((x + y) / 2), i.e. (x + y) >> 1, computed
// without forming the sum, so it cannot overflow.
//
// Identity used: x + y == 2 * (x & y) + (x ^ y), hence
//   (x + y) >> 1 == (x & y) + ((x ^ y) >> 1).
// This relies on `>>` of a negative int being an arithmetic shift, which is
// what clang implements.
//
// The previous body combined its terms with logical `||` (so it could only
// return 0 or 1), used a sign mask with a missing digit (0x8000000) and
// overflowed on `x + y`.
//
// The return type stays `unsigned int` for compatibility; the signed average
// is converted modulo 2^32, so assigning the result to an `int` recovers it.
__DEVICE__ unsigned int __hadd(int x, int y) {
    const int common = x & y;          // bits set in both operands
    const int half_diff = (x ^ y) >> 1; // half of the differing bits
    return static_cast<unsigned int>(common + half_diff);
}
// Builds a double whose bit pattern has `hi` in bits 63..32 and `lo` in bits
// 31..0 (bitwise assembly, no numeric conversion).
__DEVICE__ double __hiloint2double(int hi, int lo)
{
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    const uint64_t upper = static_cast<uint64_t>(hi) << 32ull;
    // Going through uint32_t keeps a negative `lo` from sign-extending into
    // the upper half.
    const uint64_t lower = static_cast<uint32_t>(lo);
    const uint64_t bits = upper | lower;

    double value;
    __builtin_memcpy(&value, &bits, sizeof(bits));
    return value;
}

// int -> double / float conversions. The `_rn` variants use an ordinary C++
// cast; the float `_rd` / `_ru` / `_rz` variants forward to OCML's `cvtrtn` /
// `cvtrtp` / `cvtrtz` routines.
__DEVICE__ double __int2double_rn(int x) {
    return static_cast<double>(x);
}

__DEVICE__ float __int2float_rd(int x) { return __ocml_cvtrtn_f32_s32(x); }
__DEVICE__ float __int2float_rn(int x) {
    return static_cast<float>(x);
}
__DEVICE__ float __int2float_ru(int x) { return __ocml_cvtrtp_f32_s32(x); }
__DEVICE__ float __int2float_rz(int x) { return __ocml_cvtrtz_f32_s32(x); }

// Reinterprets the bits of an int as a float (bitwise copy, no numeric
// conversion).
__DEVICE__ float __int_as_float(int x)
{
    static_assert(sizeof(float) == sizeof(int), "");

    float value;
    __builtin_memcpy(&value, &x, sizeof(value));
    return value;
}
/*******************************************
*         Not yet supported by HIP         *
********************************************/
// __DEVICE__ int __isfinited(double __a) { return __nv_isfinited(__a); }
// Floating-point classification; each function forwards to the OCML routine
// of the same name and width. The `long double` variants are declarations
// only and exist only under MSVC.
__DEVICE__ int __isfinite(double x) {
    return __ocml_isfinite_f64(x);
}
__DEVICE__ int __isinf(double x) {
    return __ocml_isinf_f64(x);
}
__DEVICE__ int __isinff(float x) {
    return __ocml_isinf_f32(x);
}
#ifdef _MSC_VER
__DEVICE__ int __isinfl(long double __a);
#endif
__DEVICE__ int __isnan(double x) {
    return __ocml_isnan_f64(x);
}
__DEVICE__ int __isnanf(float x) {
    return __ocml_isnan_f32(x);
}
#ifdef _MSC_VER
__DEVICE__ int __isnanl(long double __a);
#endif
// long long -> double conversions; `_rn` is an ordinary C++ cast, the others
// forward to OCML's `cvtrtn` / `cvtrtp` / `cvtrtz` routines.
__DEVICE__ double __ll2double_rd(long long int x) { return __ocml_cvtrtn_f64_s64(x); }
__DEVICE__ double __ll2double_rn(long long int x) {
    return static_cast<double>(x);
}
__DEVICE__ double __ll2double_ru(long long int x) { return __ocml_cvtrtp_f64_s64(x); }
__DEVICE__ double __ll2double_rz(long long int x) { return __ocml_cvtrtz_f64_s64(x); }

// long long -> float conversions; `_rn` is an ordinary C++ cast, the others
// forward to OCML's `cvtrtn` / `cvtrtp` / `cvtrtz` routines.
__DEVICE__ float __ll2float_rd(long long int x) { return __ocml_cvtrtn_f32_s64(x); }
__DEVICE__ float __ll2float_rn(long long int x) {
    return static_cast<float>(x);
}
__DEVICE__ float __ll2float_ru(long long int x) { return __ocml_cvtrtp_f32_s64(x); }
__DEVICE__ float __ll2float_rz(long long int x) { return __ocml_cvtrtz_f32_s64(x); }

// Fast float logarithms: forward to OCML's `native_log10` / `native_log2` /
// `native_log`.
__DEVICE__ float __log10f(float x) {
    return __ocml_native_log10_f32(x);
}
__DEVICE__ float __log2f(float x) {
    return __ocml_native_log2_f32(x);
}
__DEVICE__ float __logf(float x) {
    return __ocml_native_log_f32(x);
}
// Reinterprets the bits of a long long as a double (bitwise copy, no numeric
// conversion).
__DEVICE__ double __longlong_as_double(long long int x)
{
    static_assert(sizeof(double) == sizeof(long long), "");

    double value;
    __builtin_memcpy(&value, &x, sizeof(value));
    return value;
}
// Forwards to OCKL's signed `mul24` routine.
__DEVICE__ int __mul24(int x, int y)
{
    const int product = __ockl_mul24_i32(x, y);
    return product;
}
// Signed 64x64 multiply returning the upper part of the product, built from
// 32-bit halves (schoolbook multiplication): x = x1*2^32 + x0,
// y = y1*2^32 + y0, with x0/y0 unsigned low halves and x1/y1 signed high
// halves.
// NOTE(review): relies on `long`/`ulong` being 64 bits wide and on `>>` of a
// negative `long` being an arithmetic shift — confirm for the target.
__DEVICE__ long long int __mul64hi(long long int x, long long int y) {
    ulong x0 = (ulong)x & 0xffffffffUL;  // low 32 bits of x
    long x1 = x >> 32;                   // high 32 bits of x (signed)
    ulong y0 = (ulong)y & 0xffffffffUL;  // low 32 bits of y
    long y1 = y >> 32;                   // high 32 bits of y (signed)
    ulong z0 = x0 * y0;                  // low x low partial product
    // First cross term plus the carry out of the low partial product.
    long t = x1 * y0 + (z0 >> 32);
    long z1 = t & 0xffffffffL;           // low 32 bits of the running middle sum
    long z2 = t >> 32;                   // its carry into the high part
    // Second cross term accumulated onto the middle sum.
    z1 = x0 * y1 + z1;
    // High x high product plus both carries from the middle sums.
    return x1 * y1 + z2 + (z1 >> 32);
}
// Forwards to OCKL's signed `mul_hi` routine.
__DEVICE__ int __mulhi(int x, int y)
{
    const int high = __ockl_mul_hi_i32(x, y);
    return high;
}
/*******************************************
*         Not yet supported by HIP         *
********************************************/
// __DEVICE__ unsigned int __pm0(void) { return __nvvm_read_ptx_sreg_pm0(); }
// __DEVICE__ unsigned int __pm1(void) { return __nvvm_read_ptx_sreg_pm1(); }
// __DEVICE__ unsigned int __pm2(void) { return __nvvm_read_ptx_sreg_pm2(); }
// __DEVICE__ unsigned int __pm3(void) { return __nvvm_read_ptx_sreg_pm3(); }

/***********************************************
*  APIs with differences between HIP and CUDA  *
***********************************************/
// __DEVICE__ int __popc(int __a) { return __nv_popc(__a); }
// Number of set bits in a 32-bit value, via the compiler builtin.
__DEVICE__ unsigned int __popc(unsigned int input)
{
    const unsigned int count = __builtin_popcount(input);
    return count;
}
// __DEVICE__ int __popcll(long long __a) { return __nv_popcll(__a); }
// Number of set bits in a 64-bit value, via the compiler builtin.
__DEVICE__ unsigned int __popcll(unsigned long long int input)
{
    const unsigned int count = __builtin_popcountll(input);
    return count;
}
// Forwards to OCML's float `pow`.
__DEVICE__ float __powf(float x, float y)
{
    return __ocml_pow_f32(x, y);
}

// Parameter must have a known integer value.
/*******************************************
*         Not yet supported by HIP         *
********************************************/
// #define __prof_trigger(__a) __asm__ __volatile__("pmevent \t%0;" ::"i"(__a))
// Signed rounded halving add: returns (x + y + 1) >> 1, computed without
// forming the sum, so it cannot overflow.
//
// Identity used: x + y == 2 * (x | y) - (x ^ y), hence
//   (x + y + 1) >> 1 == (x | y) - ((x ^ y) >> 1).
// This relies on `>>` of a negative int being an arithmetic shift, which is
// what clang implements.
//
// The previous body combined its terms with logical `||` (so it could only
// return 0 or 1), used a sign mask with a missing digit (0x8000000) and
// overflowed on `x + y + 1`.
__DEVICE__ int __rhadd(int x, int y) {
    const int either = x | y;           // bits set in at least one operand
    const int half_diff = (x ^ y) >> 1; // half of the differing bits
    return either - half_diff;
}
// Sum of absolute difference: returns |x - y| + z (modulo 2^32).
//
// The difference is taken in unsigned arithmetic: for operands of opposite
// sign, `x - y` in `int` can exceed INT_MAX, which is signed overflow. For
// inputs whose signed difference does not overflow the result is the same as
// before.
__DEVICE__ unsigned int __sad(int x, int y, unsigned int z) {
    const unsigned int ux = static_cast<unsigned int>(x);
    const unsigned int uy = static_cast<unsigned int>(y);
    // The comparison stays signed so the larger operand is always the minuend.
    const unsigned int abs_diff = (x > y) ? (ux - uy) : (uy - ux);
    return abs_diff + z;
}
// Clamps `x` to the range [0, 1].
//   x < 0 or x is NaN -> 0
//   x > 1             -> 1
//   otherwise         -> x
// The first test is written as `!(x >= 0.0f)` so that NaN, for which every
// ordered comparison is false, is mapped to 0 instead of being passed
// through; CUDA documents __saturatef(NaN) as returning 0.
__DEVICE__ float __saturatef(float x) {
    if (!(x >= 0.0f)) {
        return 0.0f;
    }
    return (x > 1.0f) ? 1.0f : x;
}
// Sign-bit queries; forward to OCML's `signbit` for double / float.
__DEVICE__ int __signbit(double x) {
    return __ocml_signbit_f64(x);
}
__DEVICE__ int __signbitf(float x) {
    return __ocml_signbit_f32(x);
}
// Writes OCML's native sine of `x` to `*sptr` and native cosine to `*cptr`.
// Both pointers are dereferenced unconditionally.
__DEVICE__ void __sincosf(float x, float* sptr, float* cptr) {
    const float sine = __ocml_native_sin_f32(x);
    *sptr = sine;
    const float cosine = __ocml_native_cos_f32(x);
    *cptr = cosine;
}
// Forwards to OCML's native float sine.
__DEVICE__ float __sinf(float x) {
    return __ocml_native_sin_f32(x);
}
// Reduction helpers: `predicate` is first normalised to 0/1 and then passed
// to the OCKL `wgred` and / add / or reduction, whose result is returned.
// NOTE(review): presumably the reduction spans the whole work-group (per the
// `wg` prefix) — confirm against the OCKL documentation.
__DEVICE__ int __syncthreads_and(int predicate) {
  const bool flag = (predicate != 0);
  return __ockl_wgred_and_i32(flag);
}
__DEVICE__ int __syncthreads_count(int predicate) {
  const bool flag = (predicate != 0);
  return __ockl_wgred_add_i32(flag);
}
__DEVICE__ int __syncthreads_or(int predicate) {
  const bool flag = (predicate != 0);
  return __ockl_wgred_or_i32(flag);
}
// Forwards to OCML's float `tan` (the non-native variant, unlike __sinf and
// __cosf).
__DEVICE__ float __tanf(float x) {
    return __ocml_tan_f32(x);
}
// Memory fences. Each issues a sequentially-consistent AMDGCN fence; the only
// difference is the synchronisation-scope string handed to the builtin:
// "agent", "workgroup", or "" (empty).
__DEVICE__ void __threadfence(void)
{
    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
}
__DEVICE__ void __threadfence_block(void)
{
    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
}
__DEVICE__ void __threadfence_system(void)
{
    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
}
/*******************************************
*   `__trap` is not yet supported by HIP   *
********************************************/
// __DEVICE__ void __trap(void) { __asm__ __volatile__("trap;"); }

// Unsigned halving add: returns floor((x + y) / 2) using the full 33-bit sum.
//
// `(x + y) >> 1` wraps modulo 2^32 and drops the carry when the sum does not
// fit in 32 bits, so the identity
//   x + y == 2 * (x & y) + (x ^ y)
// is used instead, which never overflows.
__DEVICE__ unsigned int __uhadd(unsigned int x, unsigned int y) {
    return (x & y) + ((x ^ y) >> 1);
}
// unsigned int -> double / float conversions. The `_rn` variants use an
// ordinary C++ cast; the float `_rd` / `_ru` / `_rz` variants forward to
// OCML's `cvtrtn` / `cvtrtp` / `cvtrtz` routines.
__DEVICE__ double __uint2double_rn(unsigned int x) {
    return static_cast<double>(x);
}

__DEVICE__ float __uint2float_rd(unsigned int x) { return __ocml_cvtrtn_f32_u32(x); }
__DEVICE__ float __uint2float_rn(unsigned int x) {
    return static_cast<float>(x);
}
__DEVICE__ float __uint2float_ru(unsigned int x) { return __ocml_cvtrtp_f32_u32(x); }
__DEVICE__ float __uint2float_rz(unsigned int x) { return __ocml_cvtrtz_f32_u32(x); }
// Reinterprets the bits of an unsigned int as a float (bitwise copy, no
// numeric conversion).
__DEVICE__ float __uint_as_float(unsigned int x)
{
    static_assert(sizeof(float) == sizeof(unsigned int), "");

    float value;
    __builtin_memcpy(&value, &x, sizeof(value));
    return value;
}

// unsigned long long -> double conversions; `_rn` is an ordinary C++ cast,
// the others forward to OCML's `cvtrtn` / `cvtrtp` / `cvtrtz` routines.
__DEVICE__ double __ull2double_rd(unsigned long long int x) { return __ocml_cvtrtn_f64_u64(x); }
__DEVICE__ double __ull2double_rn(unsigned long long int x) {
    return static_cast<double>(x);
}
__DEVICE__ double __ull2double_ru(unsigned long long int x) { return __ocml_cvtrtp_f64_u64(x); }
__DEVICE__ double __ull2double_rz(unsigned long long int x) { return __ocml_cvtrtz_f64_u64(x); }

// unsigned long long -> float conversions; `_rn` is an ordinary C++ cast, the
// others forward to OCML's `cvtrtn` / `cvtrtp` / `cvtrtz` routines.
__DEVICE__ float __ull2float_rd(unsigned long long int x) { return __ocml_cvtrtn_f32_u64(x); }
__DEVICE__ float __ull2float_rn(unsigned long long int x) {
    return static_cast<float>(x);
}
__DEVICE__ float __ull2float_ru(unsigned long long int x) { return __ocml_cvtrtp_f32_u64(x); }
__DEVICE__ float __ull2float_rz(unsigned long long int x) { return __ocml_cvtrtz_f32_u64(x); }

// Forwards to OCKL's unsigned `mul24` routine.
//
// The result is converted with static_cast: reinterpret_cast only permits an
// integer type to be "converted" to itself, so it would fail to compile if
// the OCKL routine were declared with any other integer return type.
__DEVICE__ unsigned int __umul24(unsigned int x, unsigned int y) {
    return static_cast<unsigned int>(__ockl_mul24_u32(x, y));
}
// Unsigned 64x64 multiply returning the upper part of the product, built from
// 32-bit halves (schoolbook multiplication): x = x1*2^32 + x0,
// y = y1*2^32 + y0.
// NOTE(review): relies on `ulong` being 64 bits wide — confirm for the target.
__DEVICE__ unsigned long long int __umul64hi(unsigned long long int x,
                                            unsigned long long int y) {
    ulong x0 = x & 0xffffffffUL;     // low 32 bits of x
    ulong x1 = x >> 32;              // high 32 bits of x
    ulong y0 = y & 0xffffffffUL;     // low 32 bits of y
    ulong y1 = y >> 32;              // high 32 bits of y
    ulong z0 = x0 * y0;              // low x low partial product
    // First cross term plus the carry out of the low partial product.
    ulong t = x1 * y0 + (z0 >> 32);
    ulong z1 = t & 0xffffffffUL;     // low 32 bits of the running middle sum
    ulong z2 = t >> 32;              // its carry into the high part
    // Second cross term accumulated onto the middle sum.
    z1 = x0 * y1 + z1;
    // High x high product plus both carries from the middle sums.
    return x1 * y1 + z2 + (z1 >> 32);
}
// Forwards to OCKL's unsigned `mul_hi` routine.
__DEVICE__ unsigned int __umulhi(unsigned int x, unsigned int y)
{
    const unsigned int high = __ockl_mul_hi_u32(x, y);
    return high;
}
// Unsigned rounded halving add: returns floor((x + y + 1) / 2) using the full
// 33-bit sum.
//
// `(x + y + 1) >> 1` wraps modulo 2^32 and drops the carry when the sum does
// not fit in 32 bits, so the identity
//   x + y == 2 * (x | y) - (x ^ y)
// is used instead, which never overflows.
__DEVICE__ unsigned int __urhadd(unsigned int x, unsigned int y) {
    return (x | y) - ((x ^ y) >> 1);
}
// Forwards all three operands to OCKL's `sadd_u32` routine.
__DEVICE__ unsigned int __usad(unsigned int x, unsigned int y, unsigned int z)
{
    const unsigned int sum = __ockl_sadd_u32(x, y, z);
    return sum;
}

/*******************************************
*         Not yet supported by HIP         *
********************************************/
#if 0
#if CUDA_VERSION >= 9000 && CUDA_VERSION < 9020
// Absolute-value / absolute-difference wrappers. Each forwards its arguments
// unchanged to the `__nv_*` function of the same name.
// This branch sits inside `#if 0`, so none of it is compiled.
__DEVICE__ unsigned int __vabs2(unsigned int __a) {
  return __nv_vabs2(__a);
}
__DEVICE__ unsigned int __vabs4(unsigned int __a) {
  return __nv_vabs4(__a);
}
__DEVICE__ unsigned int __vabsdiffs2(unsigned int __a, unsigned int __b) { return __nv_vabsdiffs2(__a, __b); }
__DEVICE__ unsigned int __vabsdiffs4(unsigned int __a, unsigned int __b) { return __nv_vabsdiffs4(__a, __b); }
__DEVICE__ unsigned int __vabsdiffu2(unsigned int __a, unsigned int __b) { return __nv_vabsdiffu2(__a, __b); }
__DEVICE__ unsigned int __vabsdiffu4(unsigned int __a, unsigned int __b) { return __nv_vabsdiffu4(__a, __b); }
__DEVICE__ unsigned int __vabsss2(unsigned int __a) { return __nv_vabsss2(__a); }
__DEVICE__ unsigned int __vabsss4(unsigned int __a) { return __nv_vabsss4(__a); }
// Add / average wrappers. Each forwards its arguments unchanged to the
// `__nv_*` function of the same name.
// This branch sits inside `#if 0`, so none of it is compiled.
__DEVICE__ unsigned int __vadd2(unsigned int __a, unsigned int __b) { return __nv_vadd2(__a, __b); }
__DEVICE__ unsigned int __vadd4(unsigned int __a, unsigned int __b) { return __nv_vadd4(__a, __b); }
__DEVICE__ unsigned int __vaddss2(unsigned int __a, unsigned int __b) { return __nv_vaddss2(__a, __b); }
__DEVICE__ unsigned int __vaddss4(unsigned int __a, unsigned int __b) { return __nv_vaddss4(__a, __b); }
__DEVICE__ unsigned int __vaddus2(unsigned int __a, unsigned int __b) { return __nv_vaddus2(__a, __b); }
__DEVICE__ unsigned int __vaddus4(unsigned int __a, unsigned int __b) { return __nv_vaddus4(__a, __b); }
__DEVICE__ unsigned int __vavgs2(unsigned int __a, unsigned int __b) { return __nv_vavgs2(__a, __b); }
__DEVICE__ unsigned int __vavgs4(unsigned int __a, unsigned int __b) { return __nv_vavgs4(__a, __b); }
__DEVICE__ unsigned int __vavgu2(unsigned int __a, unsigned int __b) { return __nv_vavgu2(__a, __b); }
__DEVICE__ unsigned int __vavgu4(unsigned int __a, unsigned int __b) { return __nv_vavgu4(__a, __b); }
// Compare wrappers (`__vcmp*`). Each forwards its arguments unchanged to the
// `__nv_*` function of the same name.
// This branch sits inside `#if 0`, so none of it is compiled.
__DEVICE__ unsigned int __vcmpeq2(unsigned int __a, unsigned int __b) { return __nv_vcmpeq2(__a, __b); }
__DEVICE__ unsigned int __vcmpeq4(unsigned int __a, unsigned int __b) { return __nv_vcmpeq4(__a, __b); }
__DEVICE__ unsigned int __vcmpges2(unsigned int __a, unsigned int __b) { return __nv_vcmpges2(__a, __b); }
__DEVICE__ unsigned int __vcmpges4(unsigned int __a, unsigned int __b) { return __nv_vcmpges4(__a, __b); }
__DEVICE__ unsigned int __vcmpgeu2(unsigned int __a, unsigned int __b) { return __nv_vcmpgeu2(__a, __b); }
__DEVICE__ unsigned int __vcmpgeu4(unsigned int __a, unsigned int __b) { return __nv_vcmpgeu4(__a, __b); }
__DEVICE__ unsigned int __vcmpgts2(unsigned int __a, unsigned int __b) { return __nv_vcmpgts2(__a, __b); }
__DEVICE__ unsigned int __vcmpgts4(unsigned int __a, unsigned int __b) { return __nv_vcmpgts4(__a, __b); }
__DEVICE__ unsigned int __vcmpgtu2(unsigned int __a, unsigned int __b) { return __nv_vcmpgtu2(__a, __b); }
__DEVICE__ unsigned int __vcmpgtu4(unsigned int __a, unsigned int __b) { return __nv_vcmpgtu4(__a, __b); }
__DEVICE__ unsigned int __vcmples2(unsigned int __a, unsigned int __b) { return __nv_vcmples2(__a, __b); }
__DEVICE__ unsigned int __vcmples4(unsigned int __a, unsigned int __b) { return __nv_vcmples4(__a, __b); }
__DEVICE__ unsigned int __vcmpleu2(unsigned int __a, unsigned int __b) { return __nv_vcmpleu2(__a, __b); }
__DEVICE__ unsigned int __vcmpleu4(unsigned int __a, unsigned int __b) { return __nv_vcmpleu4(__a, __b); }
__DEVICE__ unsigned int __vcmplts2(unsigned int __a, unsigned int __b) { return __nv_vcmplts2(__a, __b); }
__DEVICE__ unsigned int __vcmplts4(unsigned int __a, unsigned int __b) { return __nv_vcmplts4(__a, __b); }
__DEVICE__ unsigned int __vcmpltu2(unsigned int __a, unsigned int __b) { return __nv_vcmpltu2(__a, __b); }
__DEVICE__ unsigned int __vcmpltu4(unsigned int __a, unsigned int __b) { return __nv_vcmpltu4(__a, __b); }
__DEVICE__ unsigned int __vcmpne2(unsigned int __a, unsigned int __b) { return __nv_vcmpne2(__a, __b); }
__DEVICE__ unsigned int __vcmpne4(unsigned int __a, unsigned int __b) { return __nv_vcmpne4(__a, __b); }
// Halving-add / max / min wrappers. Each forwards its arguments unchanged to
// the `__nv_*` function of the same name.
// This branch sits inside `#if 0`, so none of it is compiled.
__DEVICE__ unsigned int __vhaddu2(unsigned int __a, unsigned int __b) { return __nv_vhaddu2(__a, __b); }
__DEVICE__ unsigned int __vhaddu4(unsigned int __a, unsigned int __b) { return __nv_vhaddu4(__a, __b); }
__DEVICE__ unsigned int __vmaxs2(unsigned int __a, unsigned int __b) { return __nv_vmaxs2(__a, __b); }
__DEVICE__ unsigned int __vmaxs4(unsigned int __a, unsigned int __b) { return __nv_vmaxs4(__a, __b); }
__DEVICE__ unsigned int __vmaxu2(unsigned int __a, unsigned int __b) { return __nv_vmaxu2(__a, __b); }
__DEVICE__ unsigned int __vmaxu4(unsigned int __a, unsigned int __b) { return __nv_vmaxu4(__a, __b); }
__DEVICE__ unsigned int __vmins2(unsigned int __a, unsigned int __b) { return __nv_vmins2(__a, __b); }
__DEVICE__ unsigned int __vmins4(unsigned int __a, unsigned int __b) { return __nv_vmins4(__a, __b); }
__DEVICE__ unsigned int __vminu2(unsigned int __a, unsigned int __b) { return __nv_vminu2(__a, __b); }
__DEVICE__ unsigned int __vminu4(unsigned int __a, unsigned int __b) { return __nv_vminu4(__a, __b); }
// Negate / sum-of-absolute-difference wrappers. Each forwards its arguments
// unchanged to the `__nv_*` function of the same name.
// This branch sits inside `#if 0`, so none of it is compiled.
__DEVICE__ unsigned int __vneg2(unsigned int __a) {
  return __nv_vneg2(__a);
}
__DEVICE__ unsigned int __vneg4(unsigned int __a) {
  return __nv_vneg4(__a);
}
__DEVICE__ unsigned int __vnegss2(unsigned int __a) { return __nv_vnegss2(__a); }
__DEVICE__ unsigned int __vnegss4(unsigned int __a) { return __nv_vnegss4(__a); }
__DEVICE__ unsigned int __vsads2(unsigned int __a, unsigned int __b) { return __nv_vsads2(__a, __b); }
__DEVICE__ unsigned int __vsads4(unsigned int __a, unsigned int __b) { return __nv_vsads4(__a, __b); }
__DEVICE__ unsigned int __vsadu2(unsigned int __a, unsigned int __b) { return __nv_vsadu2(__a, __b); }
__DEVICE__ unsigned int __vsadu4(unsigned int __a, unsigned int __b) { return __nv_vsadu4(__a, __b); }
// Set-on-compare wrappers (`__vset*`). Each forwards its arguments unchanged
// to the `__nv_*` function of the same name.
// This branch sits inside `#if 0`, so none of it is compiled.
__DEVICE__ unsigned int __vseteq2(unsigned int __a, unsigned int __b) { return __nv_vseteq2(__a, __b); }
__DEVICE__ unsigned int __vseteq4(unsigned int __a, unsigned int __b) { return __nv_vseteq4(__a, __b); }
__DEVICE__ unsigned int __vsetges2(unsigned int __a, unsigned int __b) { return __nv_vsetges2(__a, __b); }
__DEVICE__ unsigned int __vsetges4(unsigned int __a, unsigned int __b) { return __nv_vsetges4(__a, __b); }
__DEVICE__ unsigned int __vsetgeu2(unsigned int __a, unsigned int __b) { return __nv_vsetgeu2(__a, __b); }
__DEVICE__ unsigned int __vsetgeu4(unsigned int __a, unsigned int __b) { return __nv_vsetgeu4(__a, __b); }
__DEVICE__ unsigned int __vsetgts2(unsigned int __a, unsigned int __b) { return __nv_vsetgts2(__a, __b); }
__DEVICE__ unsigned int __vsetgts4(unsigned int __a, unsigned int __b) { return __nv_vsetgts4(__a, __b); }
__DEVICE__ unsigned int __vsetgtu2(unsigned int __a, unsigned int __b) { return __nv_vsetgtu2(__a, __b); }
__DEVICE__ unsigned int __vsetgtu4(unsigned int __a, unsigned int __b) { return __nv_vsetgtu4(__a, __b); }
__DEVICE__ unsigned int __vsetles2(unsigned int __a, unsigned int __b) { return __nv_vsetles2(__a, __b); }
__DEVICE__ unsigned int __vsetles4(unsigned int __a, unsigned int __b) { return __nv_vsetles4(__a, __b); }
__DEVICE__ unsigned int __vsetleu2(unsigned int __a, unsigned int __b) { return __nv_vsetleu2(__a, __b); }
__DEVICE__ unsigned int __vsetleu4(unsigned int __a, unsigned int __b) { return __nv_vsetleu4(__a, __b); }
__DEVICE__ unsigned int __vsetlts2(unsigned int __a, unsigned int __b) { return __nv_vsetlts2(__a, __b); }
__DEVICE__ unsigned int __vsetlts4(unsigned int __a, unsigned int __b) { return __nv_vsetlts4(__a, __b); }
__DEVICE__ unsigned int __vsetltu2(unsigned int __a, unsigned int __b) { return __nv_vsetltu2(__a, __b); }
__DEVICE__ unsigned int __vsetltu4(unsigned int __a, unsigned int __b) { return __nv_vsetltu4(__a, __b); }
__DEVICE__ unsigned int __vsetne2(unsigned int __a, unsigned int __b) { return __nv_vsetne2(__a, __b); }
__DEVICE__ unsigned int __vsetne4(unsigned int __a, unsigned int __b) { return __nv_vsetne4(__a, __b); }
// Subtract wrappers. Each forwards its arguments unchanged to the `__nv_*`
// function of the same name.
// This branch sits inside `#if 0`, so none of it is compiled.
__DEVICE__ unsigned int __vsub2(unsigned int __a, unsigned int __b) { return __nv_vsub2(__a, __b); }
__DEVICE__ unsigned int __vsub4(unsigned int __a, unsigned int __b) { return __nv_vsub4(__a, __b); }
__DEVICE__ unsigned int __vsubss2(unsigned int __a, unsigned int __b) { return __nv_vsubss2(__a, __b); }
__DEVICE__ unsigned int __vsubss4(unsigned int __a, unsigned int __b) { return __nv_vsubss4(__a, __b); }
__DEVICE__ unsigned int __vsubus2(unsigned int __a, unsigned int __b) { return __nv_vsubus2(__a, __b); }
__DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) { return __nv_vsubus4(__a, __b); }
#else // CUDA_VERSION >= 9020
// CUDA no longer provides inline assembly (or bitcode) implementation of these
// functions, so we have to reimplement them. The implementation is naive and is
// not optimized for performance.

// Expands every N-bit boolean subfield of __a (N == shift) into an all-zeros
// or all-ones subfield by computing (__a << shift) - __a.
// E.g. __bool2mask(0x01000100,8) -> 0xff00ff00
//      __bool2mask(0x00010000,16) -> 0xffff0000
__DEVICE__ unsigned int __bool2mask(unsigned int __a, int shift) {
  const unsigned int __shifted = __a << shift;
  return __shifted - __a;
}
// Per-halfword signed absolute value, obtained as the PTX vabsdiff2.s32 of __a
// against zero.
__DEVICE__ unsigned int __vabs2(unsigned int __a) {
  unsigned int __res;
  __asm__("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(0), "r"(0));
  return __res;
}
// Per-byte signed absolute value, obtained as the PTX vabsdiff4.s32 of __a
// against zero.
__DEVICE__ unsigned int __vabs4(unsigned int __a) {
  unsigned int __res;
  __asm__("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(0), "r"(0));
  return __res;
}
// Per-halfword absolute difference of signed values (PTX vabsdiff2.s32).
__DEVICE__ unsigned int __vabsdiffs2(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}

// Per-byte absolute difference of signed values (PTX vabsdiff4.s32).
__DEVICE__ unsigned int __vabsdiffs4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-halfword absolute difference of unsigned values (PTX vabsdiff2.u32).
__DEVICE__ unsigned int __vabsdiffu2(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vabsdiff2.u32.u32.u32 %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-byte absolute difference of unsigned values (PTX vabsdiff4.u32).
__DEVICE__ unsigned int __vabsdiffu4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vabsdiff4.u32.u32.u32 %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-halfword signed absolute value with saturation: PTX vabsdiff2.s32 with
// the .sat modifier, applied to __a against zero.
__DEVICE__ unsigned int __vabsss2(unsigned int __a) {
  unsigned int __res;
  __asm__("vabsdiff2.s32.s32.s32.sat %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(0), "r"(0));
  return __res;
}
// Per-byte signed absolute value with saturation: PTX vabsdiff4.s32 with the
// .sat modifier, applied to __a against zero.
__DEVICE__ unsigned int __vabsss4(unsigned int __a) {
  unsigned int __res;
  __asm__("vabsdiff4.s32.s32.s32.sat %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(0), "r"(0));
  return __res;
}
// Per-halfword addition (PTX vadd2.u32, no saturation modifier).
__DEVICE__ unsigned int __vadd2(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vadd2.u32.u32.u32 %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-byte addition (PTX vadd4.u32, no saturation modifier).
__DEVICE__ unsigned int __vadd4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vadd4.u32.u32.u32 %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-halfword signed addition with saturation (PTX vadd2.s32 with .sat).
__DEVICE__ unsigned int __vaddss2(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vadd2.s32.s32.s32.sat %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-byte signed addition with saturation (PTX vadd4.s32 with .sat).
__DEVICE__ unsigned int __vaddss4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vadd4.s32.s32.s32.sat %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-halfword unsigned addition with saturation (PTX vadd2.u32 with .sat).
__DEVICE__ unsigned int __vaddus2(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vadd2.u32.u32.u32.sat %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-byte unsigned addition with saturation (PTX vadd4.u32 with .sat).
__DEVICE__ unsigned int __vaddus4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vadd4.u32.u32.u32.sat %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-halfword signed average (PTX vavrg2.s32).
__DEVICE__ unsigned int __vavgs2(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vavrg2.s32.s32.s32 %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-byte signed average (PTX vavrg4.s32).
__DEVICE__ unsigned int __vavgs4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vavrg4.s32.s32.s32 %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-halfword unsigned average (PTX vavrg2.u32).
__DEVICE__ unsigned int __vavgu2(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vavrg2.u32.u32.u32 %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-byte unsigned average (PTX vavrg4.u32).
__DEVICE__ unsigned int __vavgu4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vavrg4.u32.u32.u32 %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-halfword equality flags: returns the raw result of PTX vset2.u32.u32.eq
// on __a and __b (one boolean flag per halfword).
__DEVICE__ unsigned int __vseteq2(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vset2.u32.u32.eq %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-halfword equality mask: widens each flag produced by __vseteq2 into a
// full 16-bit field via __bool2mask.
__DEVICE__ unsigned int __vcmpeq2(unsigned int __a, unsigned int __b) {
  const unsigned int __flags = __vseteq2(__a, __b);
  return __bool2mask(__flags, 16);
}
// Per-byte equality flags: returns the raw result of PTX vset4.u32.u32.eq on
// __a and __b (one boolean flag per byte).
__DEVICE__ unsigned int __vseteq4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vset4.u32.u32.eq %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-byte equality mask: widens each flag produced by __vseteq4 into a full
// 8-bit field via __bool2mask.
__DEVICE__ unsigned int __vcmpeq4(unsigned int __a, unsigned int __b) {
  const unsigned int __flags = __vseteq4(__a, __b);
  return __bool2mask(__flags, 8);
}
// Per-halfword signed >= flags: raw result of PTX vset2.s32.s32.ge on __a and
// __b.
__DEVICE__ unsigned int __vsetges2(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vset2.s32.s32.ge %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-halfword signed >= mask: widens each flag produced by __vsetges2 into a
// full 16-bit field via __bool2mask.
__DEVICE__ unsigned int __vcmpges2(unsigned int __a, unsigned int __b) {
  const unsigned int __flags = __vsetges2(__a, __b);
  return __bool2mask(__flags, 16);
}
// Per-byte signed >= flags: raw result of PTX vset4.s32.s32.ge on __a and __b.
__DEVICE__ unsigned int __vsetges4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vset4.s32.s32.ge %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-byte signed >= mask: widens each flag produced by __vsetges4 into a full
// 8-bit field via __bool2mask.
__DEVICE__ unsigned int __vcmpges4(unsigned int __a, unsigned int __b) {
  const unsigned int __flags = __vsetges4(__a, __b);
  return __bool2mask(__flags, 8);
}
// Per-halfword unsigned >= flags: raw result of PTX vset2.u32.u32.ge on __a
// and __b.
__DEVICE__ unsigned int __vsetgeu2(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vset2.u32.u32.ge %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-halfword unsigned >= mask: widens each flag produced by __vsetgeu2 into
// a full 16-bit field via __bool2mask.
__DEVICE__ unsigned int __vcmpgeu2(unsigned int __a, unsigned int __b) {
  const unsigned int __flags = __vsetgeu2(__a, __b);
  return __bool2mask(__flags, 16);
}
// Per-byte unsigned >= flags: raw result of PTX vset4.u32.u32.ge on __a and
// __b.
__DEVICE__ unsigned int __vsetgeu4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vset4.u32.u32.ge %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-byte unsigned >= mask: widens each flag produced by __vsetgeu4 into a
// full 8-bit field via __bool2mask.
__DEVICE__ unsigned int __vcmpgeu4(unsigned int __a, unsigned int __b) {
  const unsigned int __flags = __vsetgeu4(__a, __b);
  return __bool2mask(__flags, 8);
}
// Per-halfword signed > flags: raw result of PTX vset2.s32.s32.gt on __a and
// __b.
__DEVICE__ unsigned int __vsetgts2(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vset2.s32.s32.gt %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-halfword signed > mask: widens each flag produced by __vsetgts2 into a
// full 16-bit field via __bool2mask.
__DEVICE__ unsigned int __vcmpgts2(unsigned int __a, unsigned int __b) {
  const unsigned int __flags = __vsetgts2(__a, __b);
  return __bool2mask(__flags, 16);
}
// Per-byte signed > flags: raw result of PTX vset4.s32.s32.gt on __a and __b.
__DEVICE__ unsigned int __vsetgts4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vset4.s32.s32.gt %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-byte signed > mask: widens each flag produced by __vsetgts4 into a full
// 8-bit field via __bool2mask.
__DEVICE__ unsigned int __vcmpgts4(unsigned int __a, unsigned int __b) {
  const unsigned int __flags = __vsetgts4(__a, __b);
  return __bool2mask(__flags, 8);
}
// Per-halfword unsigned > flags: raw result of PTX vset2.u32.u32.gt on __a and
// __b.
__DEVICE__ unsigned int __vsetgtu2(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vset2.u32.u32.gt %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-halfword unsigned > mask: widens each flag produced by __vsetgtu2 into a
// full 16-bit field via __bool2mask.
__DEVICE__ unsigned int __vcmpgtu2(unsigned int __a, unsigned int __b) {
  const unsigned int __flags = __vsetgtu2(__a, __b);
  return __bool2mask(__flags, 16);
}
// Per-byte unsigned > flags: raw result of PTX vset4.u32.u32.gt on __a and
// __b.
__DEVICE__ unsigned int __vsetgtu4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vset4.u32.u32.gt %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-byte unsigned > mask: widens each flag produced by __vsetgtu4 into a
// full 8-bit field via __bool2mask.
__DEVICE__ unsigned int __vcmpgtu4(unsigned int __a, unsigned int __b) {
  const unsigned int __flags = __vsetgtu4(__a, __b);
  return __bool2mask(__flags, 8);
}
// Per-halfword signed <= flags: raw result of PTX vset2.s32.s32.le on __a and
// __b.
__DEVICE__ unsigned int __vsetles2(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vset2.s32.s32.le %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-halfword signed <= mask: widens each flag produced by __vsetles2 into a
// full 16-bit field via __bool2mask.
__DEVICE__ unsigned int __vcmples2(unsigned int __a, unsigned int __b) {
  const unsigned int __flags = __vsetles2(__a, __b);
  return __bool2mask(__flags, 16);
}
// Per-byte signed <= flags: raw result of PTX vset4.s32.s32.le on __a and __b.
__DEVICE__ unsigned int __vsetles4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vset4.s32.s32.le %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-byte signed <= mask: widens each flag produced by __vsetles4 into a full
// 8-bit field via __bool2mask.
__DEVICE__ unsigned int __vcmples4(unsigned int __a, unsigned int __b) {
  const unsigned int __flags = __vsetles4(__a, __b);
  return __bool2mask(__flags, 8);
}
// Per-halfword unsigned <= flags: raw result of PTX vset2.u32.u32.le on __a
// and __b.
__DEVICE__ unsigned int __vsetleu2(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vset2.u32.u32.le %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-halfword unsigned <= mask: widens each flag produced by __vsetleu2 into
// a full 16-bit field via __bool2mask.
__DEVICE__ unsigned int __vcmpleu2(unsigned int __a, unsigned int __b) {
  const unsigned int __flags = __vsetleu2(__a, __b);
  return __bool2mask(__flags, 16);
}
// Per-byte unsigned <= flags: raw result of PTX vset4.u32.u32.le on __a and
// __b.
__DEVICE__ unsigned int __vsetleu4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vset4.u32.u32.le %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-byte unsigned <= mask: widens each flag produced by __vsetleu4 into a
// full 8-bit field via __bool2mask.
__DEVICE__ unsigned int __vcmpleu4(unsigned int __a, unsigned int __b) {
  const unsigned int __flags = __vsetleu4(__a, __b);
  return __bool2mask(__flags, 8);
}
// Per-halfword signed < flags: raw result of PTX vset2.s32.s32.lt on __a and
// __b.
__DEVICE__ unsigned int __vsetlts2(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vset2.s32.s32.lt %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-halfword signed < mask: widens each flag produced by __vsetlts2 into a
// full 16-bit field via __bool2mask.
__DEVICE__ unsigned int __vcmplts2(unsigned int __a, unsigned int __b) {
  const unsigned int __flags = __vsetlts2(__a, __b);
  return __bool2mask(__flags, 16);
}
// Per-byte signed < flags: raw result of PTX vset4.s32.s32.lt on __a and __b.
__DEVICE__ unsigned int __vsetlts4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vset4.s32.s32.lt %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-byte signed < mask: widens each flag produced by __vsetlts4 into a full
// 8-bit field via __bool2mask.
__DEVICE__ unsigned int __vcmplts4(unsigned int __a, unsigned int __b) {
  const unsigned int __flags = __vsetlts4(__a, __b);
  return __bool2mask(__flags, 8);
}
// Per-halfword unsigned < flags: raw result of PTX vset2.u32.u32.lt on __a and
// __b.
__DEVICE__ unsigned int __vsetltu2(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vset2.u32.u32.lt %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-halfword unsigned < mask: widens each flag produced by __vsetltu2 into a
// full 16-bit field via __bool2mask.
__DEVICE__ unsigned int __vcmpltu2(unsigned int __a, unsigned int __b) {
  const unsigned int __flags = __vsetltu2(__a, __b);
  return __bool2mask(__flags, 16);
}
// Per-byte unsigned < flags: raw result of PTX vset4.u32.u32.lt on __a and
// __b.
__DEVICE__ unsigned int __vsetltu4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vset4.u32.u32.lt %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-byte unsigned < mask: widens each flag produced by __vsetltu4 into a
// full 8-bit field via __bool2mask.
__DEVICE__ unsigned int __vcmpltu4(unsigned int __a, unsigned int __b) {
  const unsigned int __flags = __vsetltu4(__a, __b);
  return __bool2mask(__flags, 8);
}
// Per-halfword inequality flags: raw result of PTX vset2.u32.u32.ne on __a and
// __b.
__DEVICE__ unsigned int __vsetne2(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vset2.u32.u32.ne %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-halfword inequality mask: widens each flag produced by __vsetne2 into a
// full 16-bit field via __bool2mask.
__DEVICE__ unsigned int __vcmpne2(unsigned int __a, unsigned int __b) {
  const unsigned int __flags = __vsetne2(__a, __b);
  return __bool2mask(__flags, 16);
}
// Per-byte inequality flags: raw result of PTX vset4.u32.u32.ne on __a and
// __b.
__DEVICE__ unsigned int __vsetne4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vset4.u32.u32.ne %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-byte inequality mask: widens each flag produced by __vsetne4 into a full
// 8-bit field via __bool2mask.
__DEVICE__ unsigned int __vcmpne4(unsigned int __a, unsigned int __b) {
  const unsigned int __flags = __vsetne4(__a, __b);
  return __bool2mask(__flags, 8);
}

// Based on ITEM 23 in AIM-239: http://dspace.mit.edu/handle/1721.1/6086
// (a & b) + (a | b) = a + b = (a ^ b) + 2 * (a & b) =>
// (a + b) / 2 = ((a ^ b) >> 1) + (a & b)
// To operate on multiple sub-elements we need to make sure to mask out bits
// that crossed over into adjacent elements during the shift.
// Per-halfword unsigned halving add: ((a ^ b) >> 1) + (a & b), with bits 15
// and 31 of the shifted term cleared so nothing shifted out of the upper
// halfword leaks into the lower one.
__DEVICE__ unsigned int __vhaddu2(unsigned int __a, unsigned int __b) {
  const unsigned int __half_diff = ((__a ^ __b) >> 1) & ~0x80008000u;
  const unsigned int __common = __a & __b;
  return __half_diff + __common;
}
// Per-byte unsigned halving add: ((a ^ b) >> 1) + (a & b), with the top bit of
// every byte of the shifted term cleared so nothing shifted out of one byte
// leaks into its lower neighbour.
__DEVICE__ unsigned int __vhaddu4(unsigned int __a, unsigned int __b) {
  const unsigned int __half_diff = ((__a ^ __b) >> 1) & ~0x80808080u;
  const unsigned int __common = __a & __b;
  return __half_diff + __common;
}

// Per-halfword signed maximum.
// Normally a single PTX vmax2.s32 instruction. When bit 15 (the sign bit of
// the low halfword) is set in both inputs, the result is instead selected with
// the __vcmpgts2 mask, to work around a ptxas bug that produces an invalid
// result for a negative low element.
__DEVICE__ unsigned int __vmaxs2(unsigned int __a, unsigned int __b) {
  const bool __both_low_negative = (__a & 0x8000) && (__b & 0x8000);
  if (!__both_low_negative) {
    unsigned int __res;
    __asm__("vmax2.s32.s32.s32 %0,%1,%2,%3;"
            : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
    return __res;
  }
  // Workaround path: take __a where it compares greater, __b elsewhere.
  const unsigned int __gt = __vcmpgts2(__a, __b);
  return (__a & __gt) | (__b & ~__gt);
}
// Per-byte signed maximum (PTX vmax4.s32).
__DEVICE__ unsigned int __vmaxs4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vmax4.s32.s32.s32 %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-halfword unsigned maximum (PTX vmax2.u32).
__DEVICE__ unsigned int __vmaxu2(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vmax2.u32.u32.u32 %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-byte unsigned maximum (PTX vmax4.u32).
__DEVICE__ unsigned int __vmaxu4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vmax4.u32.u32.u32 %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-halfword signed minimum (PTX vmin2.s32).
__DEVICE__ unsigned int __vmins2(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vmin2.s32.s32.s32 %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-byte signed minimum (PTX vmin4.s32).
__DEVICE__ unsigned int __vmins4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vmin4.s32.s32.s32 %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-halfword unsigned minimum (PTX vmin2.u32).
__DEVICE__ unsigned int __vminu2(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vmin2.u32.u32.u32 %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-byte unsigned minimum (PTX vmin4.u32).
__DEVICE__ unsigned int __vminu4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vmin4.u32.u32.u32 %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Sum of per-halfword absolute differences of signed values: PTX
// vabsdiff2.s32 with the .add modifier and a zero third source operand.
__DEVICE__ unsigned int __vsads2(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vabsdiff2.s32.s32.s32.add %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Sum of per-byte absolute differences of signed values: PTX vabsdiff4.s32
// with the .add modifier and a zero third source operand.
__DEVICE__ unsigned int __vsads4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vabsdiff4.s32.s32.s32.add %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Sum of per-halfword absolute differences of unsigned values: PTX
// vabsdiff2.u32 with the .add modifier and a zero third source operand.
__DEVICE__ unsigned int __vsadu2(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vabsdiff2.u32.u32.u32.add %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Sum of per-byte absolute differences of unsigned values: PTX vabsdiff4.u32
// with the .add modifier and a zero third source operand.
__DEVICE__ unsigned int __vsadu4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vabsdiff4.u32.u32.u32.add %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}

// Per-halfword subtraction __a - __b (PTX vsub2.u32, no saturation modifier).
__DEVICE__ unsigned int __vsub2(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vsub2.u32.u32.u32 %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-halfword negation, expressed as __vsub2(0, __a).
__DEVICE__ unsigned int __vneg2(unsigned int __a) { return __vsub2(0, __a); }

// Per-byte subtraction __a - __b (PTX vsub4.u32, no saturation modifier).
__DEVICE__ unsigned int __vsub4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vsub4.u32.u32.u32 %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-byte negation, expressed as __vsub4(0, __a).
__DEVICE__ unsigned int __vneg4(unsigned int __a) { return __vsub4(0, __a); }
// Per-halfword signed subtraction __a - __b with saturation (PTX vsub2.s32
// with .sat).
__DEVICE__ unsigned int __vsubss2(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vsub2.s32.s32.s32.sat %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-halfword negation with signed saturation, expressed as
// __vsubss2(0, __a).
__DEVICE__ unsigned int __vnegss2(unsigned int __a) {
  return __vsubss2(0, __a);
}
// Per-byte signed subtraction __a - __b with saturation (PTX vsub4.s32 with
// .sat).
__DEVICE__ unsigned int __vsubss4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vsub4.s32.s32.s32.sat %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-byte negation with signed saturation, expressed as __vsubss4(0, __a).
__DEVICE__ unsigned int __vnegss4(unsigned int __a) {
  return __vsubss4(0, __a);
}
// Per-halfword unsigned subtraction __a - __b with saturation (PTX vsub2.u32
// with .sat).
__DEVICE__ unsigned int __vsubus2(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vsub2.u32.u32.u32.sat %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
// Per-byte unsigned subtraction __a - __b with saturation (PTX vsub4.u32 with
// .sat).
__DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) {
  unsigned int __res;
  __asm__("vsub4.u32.u32.u32.sat %0,%1,%2,%3;"
          : "=r"(__res) : "r"(__a), "r"(__b), "r"(0));
  return __res;
}
#endif // CUDA_VERSION >= 9020
#endif // 0
/****************************************************************************/

// For OpenMP we require the user to include <time.h> as we need to know what
// clock_t is on the system.
// #ifndef __OPENMP_NVPTX__
// __DEVICE__ /* clock_t= */ long long int clock() { return __clock(); }
// #endif
// __DEVICE__ long long int clock64() { return __clock64(); }

/********************************************
*   code from device_functions_internal.h   *
********************************************/
// Work-group barrier. When `flags` is non-zero the barrier is bracketed by a
// workgroup-scope release fence before it and an acquire fence after it; when
// `flags` is zero only the bare barrier is issued.
__device__ inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
    if (!flags) {
        // No memory ordering requested: plain execution barrier.
        __builtin_amdgcn_s_barrier();
        return;
    }

    __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
    __builtin_amdgcn_s_barrier();
    __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
}

// Issues a work-group barrier, reinterpreting `n` as the memory-fence flag
// value handed to __work_group_barrier().
__device__
inline
static void __barrier(int n)
{
  const __cl_mem_fence_flags __fence_flags = (__cl_mem_fence_flags)n;
  __work_group_barrier(__fence_flags);
}

// Barrier entry point for device code. Forwards __CLK_LOCAL_MEM_FENCE to
// __barrier(), which hands it to __work_group_barrier(); that helper wraps the
// hardware barrier in workgroup-scope release/acquire fences whenever the flag
// value is non-zero (presumably true for __CLK_LOCAL_MEM_FENCE -- its
// definition is not visible here; confirm).
// Declared convergent.
__device__
inline
__attribute__((convergent))
void __syncthreads()
{
  __barrier(__CLK_LOCAL_MEM_FENCE);
}

// Externally provided counter routine with C linkage; wall_clock64() below
// returns its value.
extern "C" __device__ unsigned long long __ockl_steadyctr_u64();
// Clock functions
// Forward declarations only; the inline definitions follow below and are
// compiled when __CUDA_ARCH__ is defined.
__device__ long long int __clock64();
__device__ long long int __clock();
__device__ long long int clock64();
__device__ long long int clock();
__device__ long long int wall_clock64();
// hip.amdgcn.bc - named sync
__device__ void __named_sync(int a, int b);

#ifdef __CUDA_ARCH__

// Returns the GPU core cycle count.
// The GPU can change its core clock frequency at runtime; the maximum
// frequency can be queried through the hipDeviceAttributeClockRate attribute.
__device__
inline  __attribute((always_inline))
long long int __clock64() {
#if __has_builtin(__builtin_amdgcn_s_memtime)
  // s_memtime exists on gfx8, gfx9, gfx10.1, gfx10.2, gfx10.3
  const auto __ticks = __builtin_amdgcn_s_memtime();
#else
  // Fallback; subject to change when a better solution is available
  const auto __ticks = __builtin_readcyclecounter();
#endif
  return static_cast<long long int>(__ticks);
}

// Alias of __clock64(): returns the same counter value.
__device__
inline __attribute((always_inline))
long long int  __clock() { return __clock64(); }

// Returns a wall-clock count that advances at a constant frequency, read from
// __ockl_steadyctr_u64(). The interface to query that frequency is still to
// be implemented.
__device__
inline  __attribute__((always_inline))
long long int wall_clock64() {
  const unsigned long long __ticks = __ockl_steadyctr_u64();
  return static_cast<long long int>(__ticks);
}

// Public 64-bit clock query; simply returns __clock64().
__device__
inline  __attribute__((always_inline))
long long int clock64() { return __clock64(); }

// Public clock query; returns __clock(), which in turn returns __clock64().
__device__
inline __attribute__((always_inline))
long long int  clock() { return __clock(); }

// hip.amdgcn.bc - named sync
// Both arguments are currently ignored: the body only issues
// __builtin_amdgcn_s_barrier().
__device__
inline
void __named_sync(int a, int b) { __builtin_amdgcn_s_barrier(); }

#endif // __CUDA_ARCH__

// These functions shouldn't be declared when including this header
// for math function resolution purposes.
// loop unrolling
// Byte-wise copy of `size` bytes from `src` to `dst`; returns `dst`.
// The bulk of the buffer is copied four bytes per iteration (manual
// unrolling); the 0-3 leftover bytes are then copied highest index first.
static inline __device__ void* __hip_memcpy(void* dst, const void* src, size_t size) {
    unsigned char* out = static_cast<unsigned char*>(dst);
    const unsigned char* in = static_cast<const unsigned char*>(src);
    size_t remaining = size;

    // Main body: four bytes per trip.
    for (; remaining >= 4u; remaining -= 4u, in += 4u, out += 4u) {
        out[0] = in[0];
        out[1] = in[1];
        out[2] = in[2];
        out[3] = in[3];
    }

    // Tail: at most three bytes remain, written as index 2, then 1, then 0.
    if (remaining == 3u) out[2] = in[2];
    if (remaining >= 2u) out[1] = in[1];
    if (remaining >= 1u) out[0] = in[0];

    return dst;
}

// Byte-wise fill of `size` bytes at `dst` with `val`; returns `dst`.
// The bulk of the buffer is written four bytes per iteration (manual
// unrolling); the 0-3 leftover bytes are then written highest index first.
static inline __device__ void* __hip_memset(void* dst, unsigned char val, size_t size) {
    unsigned char* out = static_cast<unsigned char*>(dst);
    size_t remaining = size;

    // Main body: four bytes per trip.
    for (; remaining >= 4u; remaining -= 4u, out += 4u) {
        out[0] = val;
        out[1] = val;
        out[2] = val;
        out[3] = val;
    }

    // Tail: at most three bytes remain, written as index 2, then 1, then 0.
    if (remaining == 3u) out[2] = val;
    if (remaining >= 2u) out[1] = val;
    if (remaining >= 1u) out[0] = val;

    return dst;
}

#ifndef __OPENMP_NVPTX__
// Device-side memcpy: forwards all three arguments to __hip_memcpy() and
// returns its result (which is `dst`).
__DEVICE__ void *memcpy(void* dst, const void* src, size_t size) {
    return __hip_memcpy(dst, src, size);
}
// Device-side memset: fills `size` bytes at `ptr` with `val` converted to
// unsigned char, and returns whatever __hip_memset() returns.
__DEVICE__ void *memset(void* ptr, int val, size_t size) {
    return __hip_memset(ptr, static_cast<unsigned char>(val), size);
}
#endif

#pragma pop_macro("__DEVICE__")
#endif // __CLANG__CUDAMOCKER_DEVICE_FUNCTIONS_H__
