/*
  Here are some hip functions that still need to be preserved, as well as cuda_hip conflicting functions.
*/

#ifndef __CLANG_CUDAMOCKER_HIP_H__
#define __CLANG_CUDAMOCKER_HIP_H__

// Declares a device function that takes no arguments, returns unsigned, and is
// bound through its asm label to the symbol "llvm.amdgcn.groupstaticsize".
// NOTE(review): presumably reports the statically allocated group (LDS) size — confirm.
__device__
unsigned __llvm_amdgcn_groupstaticsize() __asm("llvm.amdgcn.groupstaticsize");

// Declares a device function taking (index, pattern) and returning int, bound
// through its asm label to the symbol "llvm.amdgcn.ds.swizzle".
// NOTE(review): presumably `pattern` must be a compile-time constant for the intrinsic — confirm.
__device__
int __llvm_amdgcn_ds_swizzle(int index, int pattern) __asm("llvm.amdgcn.ds.swizzle");

// Returns the 64-bit result of __builtin_amdgcn_uicmp applied to `predicate`,
// 0 and the ICMP_NE comparison code (ICMP_NE is defined outside this block).
// NOTE(review): presumably bit i is set when lane i passed a non-zero predicate — confirm.
__device__
inline
unsigned long long int __ballot64(int predicate) {
    return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}

/**
 * @brief HIP-specific atomic add that hands no value back to the caller.
 *
 * Forwards both arguments unchanged to __ockl_atomic_add_noret_f32.
 *
 * @param address Pointer forwarded as the target of the add.
 * @param val     Value forwarded as the addend.
 */
__device__
inline
void atomicAddNoRet(float* address, float val)
{
    __ockl_atomic_add_noret_f32(address, val);
}

// Returns the position (0..63) of the least-significant set bit of `input`.
// If no bit is set, returns -1 converted to unsigned int (0xFFFFFFFF).
__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
    if (input == 0) {
        return static_cast<unsigned int>(-1);
    }
    // Use the `long long` variant: `unsigned long` is only 32 bits wide on
    // LLP64 hosts, where __builtin_ctzl would silently drop the upper half.
    return static_cast<unsigned int>(
        __builtin_ctzll(static_cast<unsigned long long>(input)));
}

// Extracts an unsigned bit field from a 32-bit value.
//   src0: value to read from
//   src1: bit offset of the field (only the low 5 bits are used)
//   src2: field width in bits (only the low 5 bits are used; 0 yields 0)
// Returns the field right-aligned and zero-extended. When offset + width runs
// past bit 31, only the bits that exist are returned.
__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
    const uint32_t offset = src1 & 31;
    const uint32_t width = src2 & 31;
    if (width == 0) {
        return 0;
    }
    // Shift-then-mask keeps every shift count in 0..31. The previous
    // "shift left by 32 - offset - width" form wrapped to an out-of-range
    // (undefined) shift count whenever offset + width exceeded 32.
    const uint32_t mask = (1u << width) - 1u;
    return (src0 >> offset) & mask;
}

// Extracts an unsigned bit field from a 64-bit value.
//   src0: value to read from
//   src1: bit offset of the field (only the low 6 bits are used)
//   src2: field width in bits (only the low 6 bits are used; 0 yields 0)
// Returns the field right-aligned and zero-extended. When offset + width runs
// past bit 63, only the bits that exist are returned.
__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
    const uint64_t offset = src1 & 63;
    const uint64_t width = src2 & 63;
    if (width == 0) {
        return 0;
    }
    // Shift-then-mask keeps every shift count in 0..63. The previous
    // "shift left by 64 - offset - width" form wrapped to an out-of-range
    // (undefined) shift count whenever offset + width exceeded 64.
    const uint64_t mask = (static_cast<uint64_t>(1) << width) - 1;
    return (src0 >> offset) & mask;
}

// Replaces a bit field of a 32-bit value.
//   src0: value whose field is overwritten
//   src1: source of the new field bits (taken from its low `width` bits)
//   src2: bit offset of the field (only the low 5 bits are used)
//   src3: field width in bits (only the low 5 bits are used; 0 leaves src0 unchanged)
// Returns src0 with bits [offset, offset + width) replaced; field bits shifted
// past bit 31 are discarded.
__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
    const uint32_t offset = src2 & 31;
    const uint32_t width = src3 & 31;
    // Build the mask from an unsigned 1 so that width == 31 does not shift a
    // signed int into its sign bit.
    const uint32_t mask = (1u << width) - 1u;
    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}

// Replaces a bit field of a 64-bit value.
//   src0: value whose field is overwritten
//   src1: source of the new field bits (taken from its low `width` bits)
//   src2: bit offset of the field (only the low 6 bits are used)
//   src3: field width in bits (only the low 6 bits are used; 0 leaves src0 unchanged)
__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
    const uint64_t shift = src2 & 63;
    const uint64_t nbits = src3 & 63;
    const uint64_t field = (1ULL << nbits) - 1;

    // Clear the destination field, then drop the new bits into the hole.
    const uint64_t cleared = src0 & ~(field << shift);
    const uint64_t inserted = (src1 & field) << shift;
    return cleared | inserted;
}

// Forwards both operands to __builtin_amdgcn_mbcnt_lo and returns its result.
// NOTE(review): presumably counts the bits of `x` below the calling lane
// (low 32 lanes) and adds `y` — confirm against the AMDGPU builtin docs.
__device__
static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {
    return __builtin_amdgcn_mbcnt_lo(x, y);
}

// Forwards both operands to __builtin_amdgcn_mbcnt_hi and returns its result.
// NOTE(review): presumably counts the bits of `x` below the calling lane
// (high 32 lanes) and adds `y` — confirm against the AMDGPU builtin docs.
__device__
static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) {
    return __builtin_amdgcn_mbcnt_hi(x, y);
}

/*
HIP specific device functions
*/

#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreserved-identifier"
#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
#pragma clang diagnostic ignored "-Wsign-conversion"
#pragma clang diagnostic ignored "-Wold-style-cast"
#pragma clang diagnostic ignored "-Wc++98-compat"
#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
#endif

// Runs `src` through __builtin_amdgcn_ds_bpermute with the given `index`.
// The builtin works on int, so the unsigned payload is reinterpreted through a
// union on the way in and on the way out.
__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
    union { int i; unsigned u; float f; } pun;
    pun.u = src;
    pun.i = __builtin_amdgcn_ds_bpermute(index, pun.i);
    return pun.u;
}

// Float flavour of the bpermute wrapper: the float's bits are reinterpreted as
// int through a union, passed to __builtin_amdgcn_ds_bpermute together with
// `index`, and the result is reinterpreted back to float.
__device__ static inline float __hip_ds_bpermutef(int index, float src) {
    union { int i; unsigned u; float f; } pun;
    pun.f = src;
    pun.i = __builtin_amdgcn_ds_bpermute(index, pun.i);
    return pun.f;
}

// Runs `src` through __builtin_amdgcn_ds_permute with the given `index`.
// The builtin works on int, so the unsigned payload is reinterpreted through a
// union on the way in and on the way out.
__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
    union { int i; unsigned u; float f; } pun;
    pun.u = src;
    pun.i = __builtin_amdgcn_ds_permute(index, pun.i);
    return pun.u;
}

// Float flavour of the permute wrapper: the float's bits are reinterpreted as
// int through a union, passed to __builtin_amdgcn_ds_permute together with
// `index`, and the result is reinterpreted back to float.
__device__ static inline float __hip_ds_permutef(int index, float src) {
    union { int i; unsigned u; float f; } pun;
    pun.f = src;
    pun.i = __builtin_amdgcn_ds_permute(index, pun.i);
    return pun.f;
}

// Function-style front ends for the swizzle templates: `pattern` is forwarded
// as a template argument (so it has to be a compile-time constant) and `src`
// as the ordinary function argument.
#define __hip_ds_swizzle(src, pattern)  __hip_ds_swizzle_N<(pattern)>((src))
#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))

// Passes `src` to __builtin_amdgcn_ds_swizzle with the compile-time `pattern`.
// The builtin works on int, so the unsigned payload is reinterpreted through a
// union on the way in and on the way out.
template <int pattern>
__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
    union { int i; unsigned u; float f; } pun;
    pun.u = src;
    pun.i = __builtin_amdgcn_ds_swizzle(pun.i, pattern);
    return pun.u;
}

// Float flavour of the swizzle wrapper: the float's bits are reinterpreted as
// int through a union, passed to __builtin_amdgcn_ds_swizzle with the
// compile-time `pattern`, and the result is reinterpreted back to float.
template <int pattern>
__device__ static inline float __hip_ds_swizzlef_N(float src) {
    union { int i; unsigned u; float f; } pun;
    pun.f = src;
    pun.i = __builtin_amdgcn_ds_swizzle(pun.i, pattern);
    return pun.f;
}

// Function-style front end: every control operand is forwarded as a template
// argument (so each has to be a compile-time constant), `src` as the ordinary
// function argument.
#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
  __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))

// Hands `src` and the four compile-time control operands to
// __builtin_amdgcn_mov_dpp and returns whatever the builtin produces.
template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
__device__ static inline int __hip_move_dpp_N(int src) {
    const int moved =
        __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
    return moved;
}

#if defined(__clang__)
#pragma clang diagnostic pop
#endif


// Byte-lane masks for a 32-bit word: MASK1 keeps bytes 0 and 2,
// MASK2 keeps bytes 1 and 3.
#define MASK1 0x00ff00ff
#define MASK2 0xff00ff00

// Adds the MASK1 lanes and the MASK2 lanes of `in1.w` and `in2.w` separately,
// masks each partial sum back to its own lanes, and ORs the two into `out.w`.
// Only `out.w` is assigned here; the other members of `out` are not written.
// NOTE(review): if char4::w is a single 8-bit component (as in CUDA's char4),
// this does not perform a packed add over four bytes — it looks like `.w` was
// meant to be a whole 32-bit word. Confirm against the char4 definition in use.
__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
    char4 out;
    // Even byte lanes (bytes 0 and 2).
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 + one2) & MASK1;
    // Odd byte lanes (bytes 1 and 3).
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 + one2) & MASK2);
    return out;
}

// Subtracts the MASK1 lanes and the MASK2 lanes of `in2.w` from those of
// `in1.w` separately, masks each partial difference back to its own lanes, and
// ORs the two into `out.w`.
// Only `out.w` is assigned here; the other members of `out` are not written.
// NOTE(review): if char4::w is a single 8-bit component (as in CUDA's char4),
// this does not perform a packed subtract over four bytes — it looks like `.w`
// was meant to be a whole 32-bit word. Confirm against the char4 definition in use.
__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
    char4 out;
    // Even byte lanes (bytes 0 and 2).
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 - one2) & MASK1;
    // Odd byte lanes (bytes 1 and 3).
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 - one2) & MASK2);
    return out;
}

// Multiplies the MASK1-masked values and the MASK2-masked values of `in1.w`
// and `in2.w` separately, masks each product back to its own lanes, and ORs
// the two into `out.w`.
// Only `out.w` is assigned here; the other members of `out` are not written.
// NOTE(review): multiplying two masked words as whole integers mixes lanes
// (cross terms), so this is not obviously a per-byte multiply; and if
// char4::w is a single 8-bit component the masks do not select lanes at all.
// Confirm the intended semantics against the char4 definition in use.
__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
    char4 out;
    // Even byte lanes (bytes 0 and 2).
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 * one2) & MASK1;
    // Odd byte lanes (bytes 1 and 3).
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 * one2) & MASK2);
    return out;
}

// hip.amdgcn.bc - lanemask
// Returns the bits of __ballot64(1) that sit strictly above the calling lane's
// index, where the lane index comes from __ockl_lane_u32().
__device__
inline
uint64_t  __lanemask_gt()
{
    const uint32_t self = __ockl_lane_u32();

    // Lane 63 has nothing above it; returning early also avoids shifting a
    // 64-bit value by 64.
    if (self == 63) {
        return 0;
    }

    const uint64_t participating = __ballot64(1);
    const uint64_t above = (~((uint64_t)0)) << (self + 1);
    return above & participating;
}

// Returns the bits of __ballot64(1) that sit strictly below the calling lane's
// index, where the lane index comes from __ockl_lane_u32().
__device__
inline
uint64_t __lanemask_lt()
{
    const uint32_t lane = __ockl_lane_u32();
    // Keep the ballot unsigned, like __lanemask_gt does: it is a bit mask, and
    // a signed holder only adds a needless signed/unsigned round trip.
    const uint64_t ballot = __ballot64(1);
    // (1 << lane) - 1 sets exactly the bits below `lane`; lane 0 gives 0.
    const uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
    return mask & ballot;
}

// Returns a 64-bit mask with only the calling lane's bit set, where the lane
// index comes from __ockl_lane_u32().
__device__
inline
uint64_t  __lanemask_eq()
{
    const uint32_t lane = __ockl_lane_u32();
    // Stay in uint64_t throughout: for lane 63 the value 1 << 63 does not fit
    // a signed 64-bit integer, so a signed temporary is best avoided.
    const uint64_t mask = ((uint64_t)1 << lane);
    return mask;
}

// Identity conversion: hands back exactly the pointer it was given.
__device__ inline void* __local_to_generic(void* p) {
    void* generic = p;
    return generic;
}

#ifdef __CUDA_ARCH__
// With __CUDA_ARCH__ defined the function is defined inline: the value of
// __builtin_amdgcn_groupstaticsize() is converted with __to_local, then passed
// through __local_to_generic, and the resulting pointer is returned.
// NOTE(review): presumably the dynamic group segment starts right after the
// static allocation, so the static size doubles as its offset — confirm.
__device__
inline
void* __get_dynamicgroupbaseptr()
{
    // Get group segment base pointer.
    return (char*)__local_to_generic((void*)__to_local(__builtin_amdgcn_groupstaticsize()));
}
#else
// Without __CUDA_ARCH__ only the declaration is provided; no definition is
// visible in this block.
__device__
void* __get_dynamicgroupbaseptr();
#endif // __CUDA_ARCH__

// Alternate name for __get_dynamicgroupbaseptr(): forwards the call and
// returns its result unchanged.
__device__
inline
void *__amdgcn_get_dynamicgroupbaseptr() {
    return __get_dynamicgroupbaseptr();
}

// abort
// Device-side abort(): a weak, inline definition whose whole body is a call to
// __builtin_trap().
__device__
inline
__attribute__((weak))
void abort() {
    __builtin_trap();
}


// hip.amdgcn.bc - device routine
/*
  HW_ID Register bit structure for RDNA2 & RDNA3
  WAVE_ID     4:0     Wave id within the SIMD.
  SIMD_ID     9:8     SIMD_ID within the WGP: [0] = row, [1] = column.
  WGP_ID      13:10   Physical WGP ID.
  SA_ID       16      Shader Array ID
  SE_ID       20:18   Shader Engine the wave is assigned to for gfx11
  SE_ID       19:18   Shader Engine the wave is assigned to for gfx10
  DP_RATE     31:29   Number of double-precision float units per SIMD

  HW_ID Register bit structure for GCN and CDNA
  WAVE_ID     3:0     Wave buffer slot number. 0-9.
  SIMD_ID     5:4     SIMD which the wave is assigned to within the CU.
  PIPE_ID     7:6     Pipeline from which the wave was dispatched.
  CU_ID       11:8    Compute Unit the wave is assigned to.
  SH_ID       12      Shader Array (within an SE) the wave is assigned to.
  SE_ID       15:13   Shader Engine the wave is assigned to for gfx908, gfx90a, gfx940
              14:13   Shader Engine the wave is assigned to for Vega.
  TG_ID       19:16   Thread-group ID
  VM_ID       23:20   Virtual Memory ID
  QUEUE_ID    26:24   Queue from which this wave was dispatched.
  STATE_ID    29:27   State ID (graphics only, not compute).
  ME_ID       31:30   Micro-engine ID.

  XCC_ID Register bit structure for gfx940
  XCC_ID      3:0     XCC the wave is assigned to.
 */

// Register id passed as REG to GETREG_IMMED: 23 on GFX10/GFX11, 4 otherwise.
#if (defined (__GFX10__) || defined (__GFX11__))
  #define HW_ID               23
#else
  #define HW_ID               4
#endif

// Field describing the execution unit: WGP_ID (4 bits at offset 10) on
// GFX10/GFX11, CU_ID (4 bits at offset 8) otherwise.
#if (defined(__GFX10__) || defined(__GFX11__))
  #define HW_ID_WGP_ID_SIZE   4
  #define HW_ID_WGP_ID_OFFSET 10
#else
  #define HW_ID_CU_ID_SIZE    4
  #define HW_ID_CU_ID_OFFSET  8
#endif

// Width of the SE_ID field: 3 bits on gfx908, gfx90a and GFX11, 2 bits on
// everything else.
#if (defined(__gfx908__) || defined(__gfx90a__) || \
     defined(__GFX11__))
  #define HW_ID_SE_ID_SIZE    3
#else //4 SEs/XCC for gfx940
  #define HW_ID_SE_ID_SIZE    2
#endif
// Position of the SE_ID field; GFX10/GFX11 additionally expose a 1-bit SA_ID
// field at offset 16.
#if (defined(__GFX10__) || defined(__GFX11__))
  #define HW_ID_SE_ID_OFFSET  18
  #define HW_ID_SA_ID_OFFSET  16
  #define HW_ID_SA_ID_SIZE    1
#else
  #define HW_ID_SE_ID_OFFSET  13
#endif

// XCC_ID register description. Defined for the whole gfx940/gfx941/gfx942
// family, matching the architecture list used for __HIP_NO_IMAGE_SUPPORT below.
#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
  #define XCC_ID                   20
  #define XCC_ID_XCC_ID_SIZE       4
  #define XCC_ID_XCC_ID_OFFSET     0
#endif

#if (!defined(__HIP_NO_IMAGE_SUPPORT) && \
    (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)))
  #define __HIP_NO_IMAGE_SUPPORT   1
#endif

/*
   Encoding of parameter bitmask
   HW_ID        5:0     HW_ID
   OFFSET       10:6    Range: 0..31
   SIZE         15:11   Range: 1..32
 */

#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))

/*
  __smid packs the hardware-unit ids read via s_getreg into one value.
  Layout of the result, from the low bits upward:
    GFX10/GFX11:            WGP_ID (HW_ID_WGP_ID_SIZE bits), SA_ID
                            (HW_ID_SA_ID_SIZE bits), then SE_ID.
    gfx940/gfx941/gfx942:   CU_ID (HW_ID_CU_ID_SIZE bits), SE_ID
                            (HW_ID_SE_ID_SIZE bits), then XCC_ID.
    everything else:        CU_ID (HW_ID_CU_ID_SIZE bits), then SE_ID.
  Note: the results vary over time.
  SZ minus 1 since SIZE is 1-based.
*/
__device__
inline
unsigned __smid(void)
{
    unsigned se_id = __builtin_amdgcn_s_getreg(
            GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
    #if (defined(__GFX10__) || defined(__GFX11__))
      unsigned wgp_id = __builtin_amdgcn_s_getreg(
            GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
      unsigned sa_id = __builtin_amdgcn_s_getreg(
            GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
    #else
      // The XCC id has to be part of the result on every gfx94x part;
      // otherwise waves on different XCCs report colliding ids.
      #if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
      unsigned xcc_id = __builtin_amdgcn_s_getreg(
            GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID));
      #endif
      unsigned cu_id = __builtin_amdgcn_s_getreg(
            GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
    #endif
    #if (defined(__GFX10__) || defined(__GFX11__))
      unsigned temp = se_id;
      temp = (temp << HW_ID_SA_ID_SIZE) | sa_id;
      temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id;
      return temp;
      //TODO : CU Mode impl
    #elif (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
      unsigned temp = xcc_id;
      temp = (temp << HW_ID_SE_ID_SIZE) | se_id;
      temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
      return temp;
    #else
      return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
    #endif
}

/**
 * @brief 
 * CUDA special type function implementation 
 * Features that opencl cannot support, need to be directly implemented in the header file.
 * PS: Although part of the source code can be directly modified in math_functions.hpp, 
 *     in order to reduce the code modification of the header file, 
 *     it is moved here for implementation.
 * 
 */

// opencl does not support the implementation syntax of nan and nanf.
#pragma push_macro("uint32_t")
#pragma push_macro("uint64_t")
#define uint32_t __UINT32_TYPE__
#define uint64_t __UINT64_TYPE__
// Parses a NUL-terminated string of octal digits into an integer.
// Returns the parsed value; returns 0 for a null pointer, for an empty string,
// and as soon as a character outside '0'..'7' is seen.
__device__
inline
uint64_t __make_mantissa_base8(const char* tagp)
{
    if (!tagp) return 0;

    uint64_t r = 0;
    // Stop at the terminator. Looping on the pointer itself (always non-null
    // here) would feed the NUL to the digit check and discard the result.
    while (*tagp != '\0') {
        const char tmp = *tagp;

        if (tmp >= '0' && tmp <= '7') r = (r * 8u) + tmp - '0';
        else return 0;

        ++tagp;
    }

    return r;
}

// Parses a NUL-terminated string of decimal digits into an integer.
// Returns the parsed value; returns 0 for a null pointer, for an empty string,
// and as soon as a character outside '0'..'9' is seen.
__device__
inline
uint64_t __make_mantissa_base10(const char* tagp)
{
    if (!tagp) return 0;

    uint64_t r = 0;
    // Stop at the terminator. Looping on the pointer itself (always non-null
    // here) would feed the NUL to the digit check and discard the result.
    while (*tagp != '\0') {
        const char tmp = *tagp;

        if (tmp >= '0' && tmp <= '9') r = (r * 10u) + tmp - '0';
        else return 0;

        ++tagp;
    }

    return r;
}

// Parses a NUL-terminated string of hexadecimal digits (either case, no
// prefix) into an integer.
// Returns the parsed value; returns 0 for a null pointer, for an empty string,
// and as soon as a non-hexadecimal character is seen.
__device__
inline
uint64_t __make_mantissa_base16(const char* tagp)
{
    if (!tagp) return 0;

    uint64_t r = 0;
    // Stop at the terminator. Looping on the pointer itself (always non-null
    // here) would feed the NUL to the digit check and discard the result.
    while (*tagp != '\0') {
        const char tmp = *tagp;

        if (tmp >= '0' && tmp <= '9') r = (r * 16u) + tmp - '0';
        else if (tmp >= 'a' && tmp <= 'f') r = (r * 16u) + tmp - 'a' + 10;
        else if (tmp >= 'A' && tmp <= 'F') r = (r * 16u) + tmp - 'A' + 10;
        else return 0;

        ++tagp;
    }

    return r;
}

// Turns a NaN tag string into an integer payload, choosing the radix from the
// prefix:
//   "0x" / "0X" followed by digits -> __make_mantissa_base16 on the digits
//   leading '0'                    -> __make_mantissa_base8 on the rest
//   anything else                  -> __make_mantissa_base10
// Returns 0 for a null pointer.
__device__
inline
uint64_t __make_mantissa(const char* tagp)
{
    if (!tagp) return 0u;

    if (*tagp != '0') return __make_mantissa_base10(tagp);

    ++tagp;

    // Step past the 'x'/'X' marker as well: it is not a hexadecimal digit, so
    // handing it to the hex parser would make every "0x..." tag parse as 0.
    if (*tagp == 'x' || *tagp == 'X') return __make_mantissa_base16(tagp + 1);

    return __make_mantissa_base8(tagp);
}

// Builds a single-precision quiet NaN whose low 22 fraction bits carry the
// value parsed from `tagp` by __make_mantissa (truncated to 22 bits).
// The float is assembled field by field through a bit-field overlay.
__device__
inline
float nanf(const char* tagp)
{
    union {
        float val;
        struct ieee_float {
            uint32_t mantissa : 22;
            uint32_t quiet : 1;
            uint32_t exponent : 8;
            uint32_t sign : 1;
        } bits;

        static_assert(sizeof(float) == sizeof(ieee_float), "");
    } pun;

    const uint64_t payload = __make_mantissa(tagp);

    pun.bits.mantissa = payload;   // keeps the low 22 bits
    pun.bits.quiet = 1u;
    pun.bits.exponent = 0xFFu;     // all exponent bits set
    pun.bits.sign = 0u;

    return pun.val;
}

// Builds a double-precision quiet NaN whose low 51 fraction bits carry the
// value parsed from `tagp` by __make_mantissa (truncated to 51 bits).
// Outside Windows the double is assembled through a bit-field overlay; under
// _WIN32 the same bit pattern is built with integer arithmetic instead.
__device__
inline
double nan(const char* tagp)
{
#if !_WIN32
    union {
        double val;
        struct ieee_double {
            uint64_t mantissa : 51;
            uint32_t quiet : 1;
            uint32_t exponent : 11;
            uint32_t sign : 1;
        }  bits;
        static_assert(sizeof(double) == sizeof(ieee_double), "");
    } tmp;

    tmp.bits.sign = 0u;
    tmp.bits.exponent = ~0u;
    tmp.bits.quiet = 1u;
    tmp.bits.mantissa = __make_mantissa(tagp);

    return tmp.val;
#else
    // Two-argument form so the assertion also compiles before C++17.
    static_assert(sizeof(uint64_t)==sizeof(double), "");
    uint64_t val = __make_mantissa(tagp);
    // Keep only the 51 payload bits, matching the bit-field branch, so a large
    // tag cannot spill into the quiet/exponent/sign bits.
    val &= (((uint64_t)1) << 51) - 1;
    // Quiet bit (51) plus the 11 exponent bits (52..62). The constant must be
    // 64 bits wide: shifting a 32-bit int by 51 is undefined.
    val |= ((uint64_t)0xFFF) << 51;
    // Copy the bits instead of dereferencing a reinterpret_cast'ed pointer,
    // which would violate strict aliasing.
    double result;
    __builtin_memcpy(&result, &val, sizeof(result));
    return result;
#endif
}
#pragma pop_macro("uint32_t")
#pragma pop_macro("uint64_t")


// temporarily supported some math functions
// _Float16 overload of fma: forwards x, y and z to __ocml_fma_f16 and returns
// its result.
__device__
inline _Float16 fma(_Float16 x, _Float16 y, _Float16 z) {
    return __ocml_fma_f16(x, y, z);
}

// _Float16 overload of pow with an integer exponent: forwards base and iexp to
// __ocml_pown_f16 and returns its result.
__device__
inline _Float16 pow(_Float16 base, int iexp) {
  return __ocml_pown_f16(base, iexp);
}


/**
 * Map HIP_DYNAMIC_SHARED to "extern __shared__" for compatibility with old HIP applications.
 * To be removed in a future release.
 *
 * HIP_DYNAMIC_SHARED(type, var) expands to `extern __shared__ type var[];`
 * (the expansion already ends with a semicolon).
 * HIP_DYNAMIC_SHARED_ATTRIBUTE expands to nothing.
 */
#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
#define HIP_DYNAMIC_SHARED_ATTRIBUTE


#endif  // __CLANG_CUDAMOCKER_HIP_H__
