/*===---- __clang_cudamocker_atomic_functions.h - CUDA runtime support ------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/*********************************************************************************************************************************
*                                                                                                                                *
*   split atomic functions from `__clang_cudamocker_device_functions.h` and put them in `__clang_cudamocker_atomic_functions.h`  *
*                                                                                                                                *
*********************************************************************************************************************************/



#ifndef __CLANG__CUDAMOCKER_ATOMIC_FUNCTIONS_H__
#define __CLANG__CUDAMOCKER_ATOMIC_FUNCTIONS_H__

#ifndef __OPENMP_NVPTX__
#if CUDA_VERSION < 9000
#error This file is intended to be used with CUDA-9+ only.
#endif
#endif

// __DEVICE__ is a helper macro with common set of attributes for the wrappers
// we implement in this file. We need static in order to avoid emitting unused
// functions and __forceinline__ helps inlining these wrappers at -O1.
#pragma push_macro("__DEVICE__")
#ifdef __OPENMP_NVPTX__
#define __DEVICE__ static __attribute__((always_inline, nothrow))
#else
#define __DEVICE__ static __device__ __forceinline__
#endif

/****************************************************
*   code from device_atomic_functions_internal.h    *
****************************************************/

// Compile-time type selector: Cond_t<Flag, IfTrue, IfFalse>::type names
// IfTrue when Flag is true and IfFalse when Flag is false. The primary
// template is only declared; the two partial specializations supply `type`.
template<bool Flag, typename IfTrue, typename IfFalse> struct Cond_t;

template<typename IfTrue, typename IfFalse>
struct Cond_t<true, IfTrue, IfFalse> {
  typedef IfTrue type;
};

template<typename IfTrue, typename IfFalse>
struct Cond_t<false, IfTrue, IfFalse> {
  typedef IfFalse type;
};

// The clang frontend does not currently predefine these macros by default,
// so they are defined here; they are visible in both host and device mode.
#define __HIP_MEMORY_SCOPE_SINGLETHREAD 1
#define __HIP_MEMORY_SCOPE_WAVEFRONT 2
#define __HIP_MEMORY_SCOPE_WORKGROUP 3
#define __HIP_MEMORY_SCOPE_AGENT 4
#define __HIP_MEMORY_SCOPE_SYSTEM 5

// Atomic expanders
// Generic read-modify-write built from a compare-and-swap retry loop.
//
// Template parameters:
//   mem_order - memory order passed to the load and to both order arguments of
//               the compare-exchange (defaults to __ATOMIC_SEQ_CST).
//   mem_scope - memory scope passed to the builtins (defaults to
//               __HIP_MEMORY_SCOPE_SYSTEM).
// Parameters:
//   p  - address of the object to update.
//   x  - operand handed to `op` on every attempt.
//   op - callable invoked as op(T& current, T x); it rewrites `current` in
//        place with the value that should be stored.
//   f  - callable taking no arguments; it is invoked instead of the CAS loop
//        when the is-shared query below reports true for `p`.
// Returns the value that was observed at `p` by the successful
// compare-exchange, or the result of f() on the shared path.
template<
  int mem_order = __ATOMIC_SEQ_CST,
  int mem_scope= __HIP_MEMORY_SCOPE_SYSTEM,
  typename T,
  typename Op,
  typename F>
inline
__attribute__((always_inline, device))
T hip_cas_expander(T* p, T x, Op op, F f) noexcept
{
  using FP = __attribute__((address_space(0))) const void*;

  // Local declaration whose asm label binds it to the symbol
  // "llvm.amdgcn.is.shared".
  __device__
  extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");

  // Pointers reported as shared use the caller-supplied callable directly.
  if (is_shared_workaround((FP)p))
    return f();

  // Unsigned integer type used to carry T's bits through the builtins:
  // unsigned int when the sizes match, unsigned long long otherwise.
  // NOTE(review): no check that sizeof(T) actually equals one of the two.
  using U = typename Cond_t<
    sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;

  auto q = reinterpret_cast<U*>(p);

  // tmp0 holds the expected (last observed) value, tmp1 the proposed value.
  U tmp0{__hip_atomic_load(q, mem_order, mem_scope)};
  U tmp1;
  do {
    tmp1 = tmp0;

    // Apply the update to the proposed value, viewed as a T.
    op(reinterpret_cast<T&>(tmp1), x);
    // Retry while the exchange fails; presumably the builtin refreshes tmp0
    // with the current contents on failure, as C11 compare-exchange does.
  } while (!__hip_atomic_compare_exchange_strong(q, &tmp0, tmp1, mem_order,
                                                 mem_order, mem_scope));

  return reinterpret_cast<const T&>(tmp0);
}

// Conditional store (used for min/max style updates) built from a
// compare-and-swap retry loop.
//
// Template parameters:
//   mem_order - memory order passed to the load and to both order arguments of
//               the compare-exchange (defaults to __ATOMIC_SEQ_CST).
//   mem_scope - memory scope passed to the builtins (defaults to
//               __HIP_MEMORY_SCOPE_SYSTEM).
// Parameters:
//   p   - address of the object to update.
//   x   - candidate value to store.
//   cmp - callable invoked as cmp(x, current); the store is attempted only
//         while it returns true.
//   f   - callable taking no arguments; it is invoked instead of the CAS loop
//         when the is-shared query below reports true for `p`.
// Returns the last value observed at `p` (or the result of f() on the shared
// path).
template<
  int mem_order = __ATOMIC_SEQ_CST,
  int mem_scope= __HIP_MEMORY_SCOPE_SYSTEM,
  typename T,
  typename Cmp,
  typename F>
inline
__attribute__((always_inline, device))
T hip_cas_extrema_expander(T* p, T x, Cmp cmp, F f) noexcept
{
  using FP = __attribute__((address_space(0))) const void*;

  // Local declaration whose asm label binds it to the symbol
  // "llvm.amdgcn.is.shared".
  __device__
  extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");

  // Pointers reported as shared use the caller-supplied callable directly.
  if (is_shared_workaround((FP)p))
    return f();

  // Unsigned integer type used to carry T's bits through the builtins:
  // unsigned int when the sizes match, unsigned long long otherwise.
  using U = typename Cond_t<
    sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;

  auto q = reinterpret_cast<U*>(p);

  U tmp{__hip_atomic_load(q, mem_order, mem_scope)};
  // Empty-bodied loop: stop as soon as cmp rejects the candidate or the
  // exchange succeeds. `x` is passed where a U is expected, so it is
  // converted by value (not bit-reinterpreted).
  // NOTE(review): that conversion is only bit-preserving for integral T.
  while (cmp(x, reinterpret_cast<const T&>(tmp)) &&
         !__hip_atomic_compare_exchange_strong(q, &tmp, x, mem_order, mem_order,
                                               mem_scope));

  return reinterpret_cast<const T&>(tmp);
}


/**********************************************************************************************
*   HIP does not support `__*Atomic*_block` yet, so no HIP builtin can replace them           *
**********************************************************************************************/

// Atomically adds `val` to the double at `address` using relaxed ordering at
// agent scope; returns the result of __hip_atomic_fetch_add (the prior value).
__DEVICE__ double __dAtomicAdd(double* address, double val) {
  const double previous =
      __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
}
// __DEVICE__ double __dAtomicAdd_block(double *__p, double __v) {
//   return __nvvm_atom_cta_add_gen_d(__p, __v);
// }
// Atomically adds `val` to the double at `address` using relaxed ordering at
// system scope; returns the result of __hip_atomic_fetch_add (the prior value).
__DEVICE__ double __dAtomicAdd_system(double* address, double val) {
  const double previous =
      __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
}

// Atomically adds `val` to the float at `address` using relaxed ordering at
// agent scope; returns the result of __hip_atomic_fetch_add (the prior value).
__DEVICE__ float __fAtomicAdd(float* address, float val) {
  const float previous =
      __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
}
// __DEVICE__ float __fAtomicAdd_block(float *__p, float __v) {
//   return __nvvm_atom_cta_add_gen_f(__p, __v);
// }
// Atomically adds `val` to the float at `address` using relaxed ordering at
// system scope; returns the result of __hip_atomic_fetch_add (the prior value).
__DEVICE__ float __fAtomicAdd_system(float* address, float val) {
  const float previous =
      __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
}

// Atomically stores `val` into the float at `address` (relaxed ordering,
// agent scope) and returns what __hip_atomic_exchange yields (the prior value).
__DEVICE__ float __fAtomicExch(float* address, float val) {
  const float previous =
      __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
}
// __DEVICE__ float __fAtomicExch_block(float *__p, float __v) {
//   return __nv_int_as_float(
//       __nvvm_atom_cta_xchg_gen_i((int *)__p, __nv_float_as_int(__v)));
// }
// Atomically stores `val` into the float at `address` (relaxed ordering,
// system scope) and returns what __hip_atomic_exchange yields (the prior value).
__DEVICE__ float __fAtomicExch_system(float* address, float val) {
  const float previous =
      __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
}

// Atomically adds `val` to the int at `address` using relaxed ordering at
// agent scope; returns the result of __hip_atomic_fetch_add (the prior value).
__DEVICE__ int __iAtomicAdd(int* address, int val) {
  const int previous =
      __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
}
// __DEVICE__ int __iAtomicAdd_block(int *__p, int __v) {
//   return __nvvm_atom_cta_add_gen_i(__p, __v);
// }
// Atomically adds `val` to the int at `address` using relaxed ordering at
// system scope; returns the result of __hip_atomic_fetch_add (the prior value).
__DEVICE__ int __iAtomicAdd_system(int* address, int val) {
  const int previous =
      __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
}

// Atomic bitwise AND of `val` into the int at `address` (relaxed ordering,
// agent scope). When compiled for gfx941 the request goes through
// hip_cas_expander, which receives the update rule and the native builtin as
// callables; otherwise __hip_atomic_fetch_and is called directly.
__DEVICE__ int __iAtomicAnd(int* address, int val) {
#if defined(__gfx941__)
  auto and_into = [](int& word, int mask) { word = word & mask; };
  auto native_and = [=]() {
    return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                  __HIP_MEMORY_SCOPE_AGENT);
  };
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, and_into, native_and);
#else
  const int previous =
      __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
#endif // __gfx941__
}
// __DEVICE__ int __iAtomicAnd_block(int *__p, int __v) {
//   return __nvvm_atom_cta_and_gen_i(__p, __v);
// }
// Atomic bitwise AND of `val` into the int at `address` (relaxed ordering,
// system scope). When compiled for gfx941 the request goes through
// hip_cas_expander, which receives the update rule and the native builtin as
// callables; otherwise __hip_atomic_fetch_and is called directly.
__DEVICE__ int __iAtomicAnd_system(int* address, int val) {
#if defined(__gfx941__)
  auto and_into = [](int& word, int mask) { word = word & mask; };
  auto native_and = [=]() {
    return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                  __HIP_MEMORY_SCOPE_SYSTEM);
  };
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, and_into, native_and);
#else
  const int previous =
      __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
#endif // __gfx941__
}

// Atomic compare-and-swap at agent scope with relaxed ordering (both on
// success and on failure).
//
// If the word at `address` equals `compare`, `val` is stored there. Either
// way the function returns the value the compare-exchange builtin left in
// `compare`, i.e. the value observed at `address`.
//
// This overload takes unsigned operands and is kept for existing callers; the
// unsigned result is converted to the declared `int` return type.
__DEVICE__ int __iAtomicCAS(unsigned int* address, unsigned int compare, unsigned int val) {
    __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                        __HIP_MEMORY_SCOPE_AGENT);
    return static_cast<int>(compare);
}

// Signed overload with the same (int*, int, int) shape as
// __iAtomicCAS_system, so callers holding an `int*` do not need a pointer
// cast. Semantics are identical to the unsigned overload above.
__DEVICE__ int __iAtomicCAS(int* address, int compare, int val) {
    __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                        __HIP_MEMORY_SCOPE_AGENT);
    return compare;
}
// __DEVICE__ int __iAtomicCAS_block(int *__p, int __cmp, int __v) {
//   return __nvvm_atom_cta_cas_gen_i(__p, __cmp, __v);
// }
// Atomic compare-and-swap on an int at system scope with relaxed ordering for
// both the success and failure cases. Returns `compare` as left by the
// builtin, i.e. the value observed at `address`.
__DEVICE__ int __iAtomicCAS_system(int* address, int compare, int val) {
  // The boolean success flag is not needed; only the observed value is.
  (void)__hip_atomic_compare_exchange_strong(address, &compare, val,
                                             __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                             __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}

// Atomically stores `val` into the int at `address` (relaxed ordering, agent
// scope) and returns what __hip_atomic_exchange yields (the prior value).
__DEVICE__ int __iAtomicExch(int* address, int val) {
  const int previous =
      __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
}
// __DEVICE__ int __iAtomicExch_block(int *__p, int __v) {
//   return __nvvm_atom_cta_xchg_gen_i(__p, __v);
// }
// Atomically stores `val` into the int at `address` (relaxed ordering, system
// scope) and returns what __hip_atomic_exchange yields (the prior value).
__DEVICE__ int __iAtomicExch_system(int* address, int val) {
  const int previous =
      __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
}

// Atomic signed maximum of `val` and the int at `address` (relaxed ordering,
// agent scope). When compiled for gfx941 the request goes through
// hip_cas_extrema_expander with a "current < candidate" predicate and the
// native builtin as callables; otherwise __hip_atomic_fetch_max is called
// directly.
__DEVICE__ int __iAtomicMax(int* address, int val) {
#if defined(__gfx941__)
  auto should_replace = [](int candidate, int current) { return current < candidate; };
  auto native_max = [=]() {
    return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                  __HIP_MEMORY_SCOPE_AGENT);
  };
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, should_replace, native_max);
#else
  const int previous =
      __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
#endif // __gfx941__
}
// __DEVICE__ int __iAtomicMax_block(int *__p, int __v) {
//   return __nvvm_atom_cta_max_gen_i(__p, __v);
// }
// Atomic signed maximum of `val` and the int at `address` (relaxed ordering,
// system scope). When compiled for gfx941 the request goes through
// hip_cas_extrema_expander with a "current < candidate" predicate and the
// native builtin as callables; otherwise __hip_atomic_fetch_max is called
// directly.
__DEVICE__ int __iAtomicMax_system(int* address, int val) {
#if defined(__gfx941__)
  auto should_replace = [](int candidate, int current) { return current < candidate; };
  auto native_max = [=]() {
    return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                  __HIP_MEMORY_SCOPE_SYSTEM);
  };
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, should_replace, native_max);
#else
  const int previous =
      __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
#endif // __gfx941__
}

// Atomic signed minimum of `val` and the int at `address` (relaxed ordering,
// agent scope). When compiled for gfx941 the request goes through
// hip_cas_extrema_expander with a "candidate < current" predicate and the
// native builtin as callables; otherwise __hip_atomic_fetch_min is called
// directly.
__DEVICE__ int __iAtomicMin(int* address, int val) {
#if defined(__gfx941__)
  auto should_replace = [](int candidate, int current) { return candidate < current; };
  auto native_min = [=]() {
    return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                  __HIP_MEMORY_SCOPE_AGENT);
  };
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, should_replace, native_min);
#else
  const int previous =
      __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
#endif // __gfx941__
}
// __DEVICE__ int __iAtomicMin_block(int *__p, int __v) {
//   return __nvvm_atom_cta_min_gen_i(__p, __v);
// }
// Atomic signed minimum of `val` and the int at `address` (relaxed ordering,
// system scope). When compiled for gfx941 the request goes through
// hip_cas_extrema_expander with a "candidate < current" predicate and the
// native builtin as callables; otherwise __hip_atomic_fetch_min is called
// directly.
__DEVICE__ int __iAtomicMin_system(int* address, int val) {
#if defined(__gfx941__)
  auto should_replace = [](int candidate, int current) { return candidate < current; };
  auto native_min = [=]() {
    return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                  __HIP_MEMORY_SCOPE_SYSTEM);
  };
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, should_replace, native_min);
#else
  const int previous =
      __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
#endif // __gfx941__
}

// Atomic bitwise OR of `val` into the int at `address` (relaxed ordering,
// agent scope). When compiled for gfx941 the request goes through
// hip_cas_expander, which receives the update rule and the native builtin as
// callables; otherwise __hip_atomic_fetch_or is called directly.
__DEVICE__ int __iAtomicOr(int* address, int val) {
#if defined(__gfx941__)
  auto or_into = [](int& word, int bits) { word = word | bits; };
  auto native_or = [=]() {
    return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                 __HIP_MEMORY_SCOPE_AGENT);
  };
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, or_into, native_or);
#else
  const int previous =
      __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
#endif // __gfx941__
}
// __DEVICE__ int __iAtomicOr_block(int *__p, int __v) {
//   return __nvvm_atom_cta_or_gen_i(__p, __v);
// }
// Atomic bitwise OR of `val` into the int at `address` (relaxed ordering,
// system scope). When compiled for gfx941 the request goes through
// hip_cas_expander, which receives the update rule and the native builtin as
// callables; otherwise __hip_atomic_fetch_or is called directly.
__DEVICE__ int __iAtomicOr_system(int* address, int val) {
#if defined(__gfx941__)
  auto or_into = [](int& word, int bits) { word = word | bits; };
  auto native_or = [=]() {
    return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                 __HIP_MEMORY_SCOPE_SYSTEM);
  };
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, or_into, native_or);
#else
  const int previous =
      __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
#endif // __gfx941__
}

// Atomic bitwise XOR of `val` into the int at `address` (relaxed ordering,
// agent scope). When compiled for gfx941 the request goes through
// hip_cas_expander, which receives the update rule and the native builtin as
// callables; otherwise __hip_atomic_fetch_xor is called directly.
__DEVICE__ int __iAtomicXor(int* address, int val) {
#if defined(__gfx941__)
  auto xor_into = [](int& word, int bits) { word = word ^ bits; };
  auto native_xor = [=]() {
    return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                  __HIP_MEMORY_SCOPE_AGENT);
  };
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, xor_into, native_xor);
#else
  const int previous =
      __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
#endif // __gfx941__
}
// __DEVICE__ int __iAtomicXor_block(int *__p, int __v) {
//   return __nvvm_atom_cta_xor_gen_i(__p, __v);
// }
// Atomic bitwise XOR of `val` into the int at `address` (relaxed ordering,
// system scope). When compiled for gfx941 the request goes through
// hip_cas_expander, which receives the update rule and the native builtin as
// callables; otherwise __hip_atomic_fetch_xor is called directly.
__DEVICE__ int __iAtomicXor_system(int* address, int val) {
#if defined(__gfx941__)
  auto xor_into = [](int& word, int bits) { word = word ^ bits; };
  auto native_xor = [=]() {
    return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                  __HIP_MEMORY_SCOPE_SYSTEM);
  };
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, xor_into, native_xor);
#else
  const int previous =
      __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
#endif // __gfx941__
}

/**************************************************************************
*    HIP does not yet support `long long` arguments for atomic functions  *
**************************************************************************/
// __DEVICE__ long long __illAtomicMax(long long *__p, long long __v) {
//   return __nvvm_atom_max_gen_ll(__p, __v);
// }
// __DEVICE__ long long __illAtomicMax_block(long long *__p, long long __v) {
//   return __nvvm_atom_cta_max_gen_ll(__p, __v);
// }
// __DEVICE__ long long __illAtomicMax_system(long long *__p, long long __v) {
//   return __nvvm_atom_sys_max_gen_ll(__p, __v);
// }
// __DEVICE__ long long __illAtomicMin(long long *__p, long long __v) {
//   return __nvvm_atom_min_gen_ll(__p, __v);
// }
// __DEVICE__ long long __illAtomicMin_block(long long *__p, long long __v) {
//   return __nvvm_atom_cta_min_gen_ll(__p, __v);
// }
// __DEVICE__ long long __illAtomicMin_system(long long *__p, long long __v) {
//   return __nvvm_atom_sys_min_gen_ll(__p, __v);
// }
// __DEVICE__ long long __llAtomicAnd(long long *__p, long long __v) {
//   return __nvvm_atom_and_gen_ll(__p, __v);
// }
// __DEVICE__ long long __llAtomicAnd_block(long long *__p, long long __v) {
//   return __nvvm_atom_cta_and_gen_ll(__p, __v);
// }
// __DEVICE__ long long __llAtomicAnd_system(long long *__p, long long __v) {
//   return __nvvm_atom_sys_and_gen_ll(__p, __v);
// }
// __DEVICE__ long long __llAtomicOr(long long *__p, long long __v) {
//   return __nvvm_atom_or_gen_ll(__p, __v);
// }
// __DEVICE__ long long __llAtomicOr_block(long long *__p, long long __v) {
//   return __nvvm_atom_cta_or_gen_ll(__p, __v);
// }
// __DEVICE__ long long __llAtomicOr_system(long long *__p, long long __v) {
//   return __nvvm_atom_sys_or_gen_ll(__p, __v);
// }
// __DEVICE__ long long __llAtomicXor(long long *__p, long long __v) {
//   return __nvvm_atom_xor_gen_ll(__p, __v);
// }
// __DEVICE__ long long __llAtomicXor_block(long long *__p, long long __v) {
//   return __nvvm_atom_cta_xor_gen_ll(__p, __v);
// }
// __DEVICE__ long long __llAtomicXor_system(long long *__p, long long __v) {
//   return __nvvm_atom_sys_xor_gen_ll(__p, __v);
// }

// Atomically adds `val` to the unsigned int at `address` using relaxed
// ordering at agent scope; returns the result of __hip_atomic_fetch_add (the
// prior value).
__DEVICE__ unsigned int __uAtomicAdd(unsigned int* address, unsigned int val) {
  const unsigned int previous =
      __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
}
// __DEVICE__ unsigned int __uAtomicAdd_block(unsigned int *__p,
//                                            unsigned int __v) {
//   return __nvvm_atom_cta_add_gen_i((int *)__p, __v);
// }
// Atomically adds `val` to the unsigned int at `address` using relaxed
// ordering at system scope; returns the result of __hip_atomic_fetch_add (the
// prior value).
__DEVICE__ unsigned int __uAtomicAdd_system(unsigned int* address, unsigned int val) {
  const unsigned int previous =
      __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
}

// Atomic bitwise AND of `val` into the unsigned int at `address` (relaxed
// ordering, agent scope). When compiled for gfx941 the request goes through
// hip_cas_expander, which receives the update rule and the native builtin as
// callables; otherwise __hip_atomic_fetch_and is called directly.
__DEVICE__ unsigned int __uAtomicAnd(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  auto and_into = [](unsigned int& word, unsigned int mask) { word = word & mask; };
  auto native_and = [=]() {
    return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                  __HIP_MEMORY_SCOPE_AGENT);
  };
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, and_into, native_and);
#else
  const unsigned int previous =
      __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
#endif // __gfx941__
}
// __DEVICE__ unsigned int __uAtomicAnd_block(unsigned int *__p,
//                                            unsigned int __v) {
//   return __nvvm_atom_cta_and_gen_i((int *)__p, __v);
// }
// Atomic bitwise AND of `val` into the unsigned int at `address` (relaxed
// ordering, system scope). When compiled for gfx941 the request goes through
// hip_cas_expander, which receives the update rule and the native builtin as
// callables; otherwise __hip_atomic_fetch_and is called directly.
__DEVICE__ unsigned int __uAtomicAnd_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  auto and_into = [](unsigned int& word, unsigned int mask) { word = word & mask; };
  auto native_and = [=]() {
    return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                  __HIP_MEMORY_SCOPE_SYSTEM);
  };
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, and_into, native_and);
#else
  const unsigned int previous =
      __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
#endif // __gfx941__
}

// Atomic compare-and-swap on an unsigned int at agent scope with relaxed
// ordering for both the success and failure cases. Returns `compare` as left
// by the builtin, i.e. the value observed at `address`.
__DEVICE__ unsigned int __uAtomicCAS(unsigned int* address, unsigned int compare, unsigned int val) {
  // The boolean success flag is not needed; only the observed value is.
  (void)__hip_atomic_compare_exchange_strong(address, &compare, val,
                                             __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                             __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}
// __DEVICE__ unsigned int
// __uAtomicCAS_block(unsigned int *__p, unsigned int __cmp, unsigned int __v) {
//   return __nvvm_atom_cta_cas_gen_i((int *)__p, __cmp, __v);
// }
// Atomic compare-and-swap on an unsigned int at system scope with relaxed
// ordering for both the success and failure cases. Returns `compare` as left
// by the builtin, i.e. the value observed at `address`.
__DEVICE__ unsigned int __uAtomicCAS_system(unsigned int* address, unsigned int compare, unsigned int val) {
  // The boolean success flag is not needed; only the observed value is.
  (void)__hip_atomic_compare_exchange_strong(address, &compare, val,
                                             __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                             __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}

// Atomic wrapping decrement of the unsigned int at `address`, bounded by
// `val`, with relaxed ordering.
//
// On gfx941 the request goes through hip_cas_expander at agent scope. The
// update rule stores `val` when the current value is 0 or greater than `val`,
// and current - 1 otherwise; the second callable invokes a locally declared
// function bound by asm label to "llvm.amdgcn.atomic.dec.i32.p0i32".
// On other targets __builtin_amdgcn_atomic_dec32 is called with the "agent"
// scope string.
__DEVICE__ unsigned int __uAtomicDec(unsigned int* address, unsigned int val)
{
#if defined(__gfx941__)
  __device__
  extern
  unsigned int __builtin_amdgcn_atomic_dec(
    unsigned int*,
    unsigned int,
    unsigned int,
    unsigned int,
    bool) __asm("llvm.amdgcn.atomic.dec.i32.p0i32");

  auto wrap_decrement = [](unsigned int& current, unsigned int limit) {
    if (current == 0 || current > limit) {
      current = limit;
    } else {
      current = current - 1;
    }
  };
  auto native_dec = [=]() {
    return
      __builtin_amdgcn_atomic_dec(address, val, __ATOMIC_RELAXED, 1, false);
  };
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, wrap_decrement, native_dec);
#else
  const unsigned int previous =
      __builtin_amdgcn_atomic_dec32(address, val, __ATOMIC_RELAXED, "agent");
  return previous;
#endif // __gfx941__
}
// __DEVICE__ unsigned int __uAtomicDec_block(unsigned int *__p,
//                                            unsigned int __v) {
//   return __nvvm_atom_cta_dec_gen_ui(__p, __v);
// }
// __DEVICE__ unsigned int __uAtomicDec_system(unsigned int *__p,
//                                             unsigned int __v) {
//   return __nvvm_atom_sys_dec_gen_ui(__p, __v);
// }

// Atomically stores `val` into the unsigned int at `address` (relaxed
// ordering, agent scope) and returns what __hip_atomic_exchange yields (the
// prior value).
__DEVICE__ unsigned int __uAtomicExch(unsigned int* address, unsigned int val) {
  const unsigned int previous =
      __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
}
// __DEVICE__ unsigned int __uAtomicExch_block(unsigned int *__p,
//                                             unsigned int __v) {
//   return __nvvm_atom_cta_xchg_gen_i((int *)__p, __v);
// }
// Atomically stores `val` into the unsigned int at `address` (relaxed
// ordering, system scope) and returns what __hip_atomic_exchange yields (the
// prior value).
__DEVICE__ unsigned int __uAtomicExch_system(unsigned int* address, unsigned int val) {
  const unsigned int previous =
      __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
}

// Atomic wrapping increment of the unsigned int at `address`, bounded by
// `val`, with relaxed ordering.
//
// On gfx941 the request goes through hip_cas_expander at agent scope. The
// update rule stores 0 when the current value is greater than or equal to
// `val`, and current + 1 otherwise; the second callable invokes a locally
// declared function bound by asm label to "llvm.amdgcn.atomic.inc.i32.p0i32".
// On other targets __builtin_amdgcn_atomic_inc32 is called with the "agent"
// scope string.
__DEVICE__ unsigned int __uAtomicInc(unsigned int* address, unsigned int val)
{
#if defined(__gfx941__)
  __device__
  extern
  unsigned int __builtin_amdgcn_atomic_inc(
    unsigned int*,
    unsigned int,
    unsigned int,
    unsigned int,
    bool) __asm("llvm.amdgcn.atomic.inc.i32.p0i32");

  auto wrap_increment = [](unsigned int& current, unsigned int limit) {
    if (current >= limit) {
      current = 0;
    } else {
      current = current + 1;
    }
  };
  auto native_inc = [=]() {
    return
      __builtin_amdgcn_atomic_inc(address, val, __ATOMIC_RELAXED, 1, false);
  };
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, wrap_increment, native_inc);
#else
  const unsigned int previous =
      __builtin_amdgcn_atomic_inc32(address, val, __ATOMIC_RELAXED, "agent");
  return previous;
#endif // __gfx941__
}
// __DEVICE__ unsigned int __uAtomicInc_block(unsigned int *__p,
//                                            unsigned int __v) {
//   return __nvvm_atom_cta_inc_gen_ui(__p, __v);
// }
// __DEVICE__ unsigned int __uAtomicInc_system(unsigned int *__p,
//                                             unsigned int __v) {
//   return __nvvm_atom_sys_inc_gen_ui(__p, __v);
// }

// Atomic unsigned maximum of `val` and the unsigned int at `address` (relaxed
// ordering, agent scope). When compiled for gfx941 the request goes through
// hip_cas_extrema_expander with a "current < candidate" predicate and the
// native builtin as callables; otherwise __hip_atomic_fetch_max is called
// directly.
__DEVICE__ unsigned int __uAtomicMax(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  auto should_replace = [](unsigned int candidate, unsigned int current) {
    return current < candidate;
  };
  auto native_max = [=]() {
    return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                  __HIP_MEMORY_SCOPE_AGENT);
  };
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, should_replace, native_max);
#else
  const unsigned int previous =
      __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
#endif // __gfx941__
}
// __DEVICE__ unsigned int __uAtomicMax_block(unsigned int *__p,
//                                            unsigned int __v) {
//   return __nvvm_atom_cta_max_gen_ui(__p, __v);
// }
// Atomic unsigned maximum of `val` and the unsigned int at `address` (relaxed
// ordering, system scope). When compiled for gfx941 the request goes through
// hip_cas_extrema_expander with a "current < candidate" predicate and the
// native builtin as callables; otherwise __hip_atomic_fetch_max is called
// directly.
__DEVICE__ unsigned int __uAtomicMax_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  auto should_replace = [](unsigned int candidate, unsigned int current) {
    return current < candidate;
  };
  auto native_max = [=]() {
    return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                  __HIP_MEMORY_SCOPE_SYSTEM);
  };
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, should_replace, native_max);
#else
  const unsigned int previous =
      __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
#endif // __gfx941__
}

// Atomic unsigned minimum of `val` and the unsigned int at `address` (relaxed
// ordering, agent scope). When compiled for gfx941 the request goes through
// hip_cas_extrema_expander with a "candidate < current" predicate and the
// native builtin as callables; otherwise __hip_atomic_fetch_min is called
// directly.
__DEVICE__ unsigned int __uAtomicMin(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  auto should_replace = [](unsigned int candidate, unsigned int current) {
    return candidate < current;
  };
  auto native_min = [=]() {
    return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                  __HIP_MEMORY_SCOPE_AGENT);
  };
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, should_replace, native_min);
#else
  const unsigned int previous =
      __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
#endif // __gfx941__
}
// __DEVICE__ unsigned int __uAtomicMin_block(unsigned int *__p,
//                                            unsigned int __v) {
//   return __nvvm_atom_cta_min_gen_ui(__p, __v);
// }
// Atomic unsigned minimum of `val` and the unsigned int at `address` (relaxed
// ordering, system scope). When compiled for gfx941 the request goes through
// hip_cas_extrema_expander with a "candidate < current" predicate and the
// native builtin as callables; otherwise __hip_atomic_fetch_min is called
// directly.
__DEVICE__ unsigned int __uAtomicMin_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  auto should_replace = [](unsigned int candidate, unsigned int current) {
    return candidate < current;
  };
  auto native_min = [=]() {
    return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                  __HIP_MEMORY_SCOPE_SYSTEM);
  };
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, should_replace, native_min);
#else
  const unsigned int previous =
      __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
#endif // __gfx941__
}

// Atomic bitwise OR of `val` into the unsigned int at `address` (relaxed
// ordering, agent scope). When compiled for gfx941 the request goes through
// hip_cas_expander, which receives the update rule and the native builtin as
// callables; otherwise __hip_atomic_fetch_or is called directly.
__DEVICE__ unsigned int __uAtomicOr(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  auto or_into = [](unsigned int& word, unsigned int bits) { word = word | bits; };
  auto native_or = [=]() {
    return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                 __HIP_MEMORY_SCOPE_AGENT);
  };
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, or_into, native_or);
#else
  const unsigned int previous =
      __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
#endif // __gfx941__
}
// __DEVICE__ unsigned int __uAtomicOr_block(unsigned int *__p, unsigned int __v) {
//   return __nvvm_atom_cta_or_gen_i((int *)__p, __v);
// }
// Atomic bitwise OR of `val` into the unsigned int at `address` (relaxed
// ordering, system scope). When compiled for gfx941 the request goes through
// hip_cas_expander, which receives the update rule and the native builtin as
// callables; otherwise __hip_atomic_fetch_or is called directly.
__DEVICE__ unsigned int __uAtomicOr_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  auto or_into = [](unsigned int& word, unsigned int bits) { word = word | bits; };
  auto native_or = [=]() {
    return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                 __HIP_MEMORY_SCOPE_SYSTEM);
  };
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, or_into, native_or);
#else
  const unsigned int previous =
      __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
#endif // __gfx941__
}

// Atomic bitwise XOR of `val` into the unsigned int at `address` (relaxed
// ordering, agent scope). When compiled for gfx941 the request goes through
// hip_cas_expander, which receives the update rule and the native builtin as
// callables; otherwise __hip_atomic_fetch_xor is called directly.
__DEVICE__ unsigned int __uAtomicXor(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  auto xor_into = [](unsigned int& word, unsigned int bits) { word = word ^ bits; };
  auto native_xor = [=]() {
    return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                  __HIP_MEMORY_SCOPE_AGENT);
  };
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, xor_into, native_xor);
#else
  const unsigned int previous =
      __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
#endif // __gfx941__
}
// __DEVICE__ unsigned int __uAtomicXor_block(unsigned int *__p,
//                                            unsigned int __v) {
//   return __nvvm_atom_cta_xor_gen_i((int *)__p, __v);
// }
// Atomic bitwise XOR of `val` into the unsigned int at `address` (relaxed
// ordering, system scope). When compiled for gfx941 the request goes through
// hip_cas_expander, which receives the update rule and the native builtin as
// callables; otherwise __hip_atomic_fetch_xor is called directly.
__DEVICE__ unsigned int __uAtomicXor_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  auto xor_into = [](unsigned int& word, unsigned int bits) { word = word ^ bits; };
  auto native_xor = [=]() {
    return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                  __HIP_MEMORY_SCOPE_SYSTEM);
  };
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, xor_into, native_xor);
#else
  const unsigned int previous =
      __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
#endif // __gfx941__
}

// Atomically adds `val` to the unsigned long long at `address` using relaxed
// ordering at agent scope; returns the result of __hip_atomic_fetch_add (the
// prior value).
__DEVICE__ unsigned long long __ullAtomicAdd(unsigned long long* address, unsigned long long val) {
  const unsigned long long previous =
      __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
}
// __DEVICE__ unsigned long long __ullAtomicAdd_block(unsigned long long *__p,
//                                                    unsigned long long __v) {
//   return __nvvm_atom_cta_add_gen_ll((long long *)__p, __v);
// }
// Atomically adds `val` to the unsigned long long at `address` using relaxed
// ordering at system scope; returns the result of __hip_atomic_fetch_add (the
// prior value).
__DEVICE__ unsigned long long __ullAtomicAdd_system(unsigned long long* address, unsigned long long val) {
  const unsigned long long previous =
      __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
}

// Atomic bitwise AND of `val` into the unsigned long long at `address`
// (relaxed ordering, agent scope). When compiled for gfx941 the request goes
// through hip_cas_expander, which receives the update rule and the native
// builtin as callables; otherwise __hip_atomic_fetch_and is called directly.
__DEVICE__ unsigned long long __ullAtomicAnd(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  auto and_into = [](unsigned long long& word, unsigned long long mask) {
    word = word & mask;
  };
  auto native_and = [=]() {
    return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                  __HIP_MEMORY_SCOPE_AGENT);
  };
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, and_into, native_and);
#else
  const unsigned long long previous =
      __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
#endif // __gfx941__
}
// __DEVICE__ unsigned long long __ullAtomicAnd_block(unsigned long long *__p,
//                                                    unsigned long long __v) {
//   return __nvvm_atom_cta_and_gen_ll((long long *)__p, __v);
// }
// Atomic bitwise AND of `val` into the unsigned long long at `address`
// (relaxed ordering, system scope). When compiled for gfx941 the request goes
// through hip_cas_expander, which receives the update rule and the native
// builtin as callables; otherwise __hip_atomic_fetch_and is called directly.
__DEVICE__ unsigned long long __ullAtomicAnd_system(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  auto and_into = [](unsigned long long& word, unsigned long long mask) {
    word = word & mask;
  };
  auto native_and = [=]() {
    return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                  __HIP_MEMORY_SCOPE_SYSTEM);
  };
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, and_into, native_and);
#else
  const unsigned long long previous =
      __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
#endif // __gfx941__
}

// Atomic compare-and-swap on an unsigned long long at agent scope with
// relaxed ordering for both the success and failure cases. Returns `compare`
// as left by the builtin, i.e. the value observed at `address`.
__DEVICE__ unsigned long long __ullAtomicCAS(unsigned long long* address, unsigned long long compare,
                             unsigned long long val) {
  // The boolean success flag is not needed; only the observed value is.
  (void)__hip_atomic_compare_exchange_strong(address, &compare, val,
                                             __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                             __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}
// __DEVICE__ unsigned long long __ullAtomicCAS_block(unsigned long long *__p,
//                                                    unsigned long long __cmp,
//                                                    unsigned long long __v) {
//   return __nvvm_atom_cta_cas_gen_ll((long long *)__p, __cmp, __v);
// }
// Atomic compare-and-swap on an unsigned long long at system scope with
// relaxed ordering for both the success and failure cases. Returns `compare`
// as left by the builtin, i.e. the value observed at `address`.
__DEVICE__ unsigned long long __ullAtomicCAS_system(unsigned long long* address, unsigned long long compare,
                                    unsigned long long val) {
  // The boolean success flag is not needed; only the observed value is.
  (void)__hip_atomic_compare_exchange_strong(address, &compare, val,
                                             __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                             __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}

// Atomically stores `val` into the unsigned long long at `address` (relaxed
// ordering, agent scope) and returns what __hip_atomic_exchange yields (the
// prior value).
__DEVICE__ unsigned long long __ullAtomicExch(unsigned long long* address, unsigned long long val) {
  const unsigned long long previous =
      __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
}
// __DEVICE__ unsigned long long __ullAtomicExch_block(unsigned long long *__p,
//                                                     unsigned long long __v) {
//   return __nvvm_atom_cta_xchg_gen_ll((long long *)__p, __v);
// }
// Atomically stores `val` into the unsigned long long at `address` (relaxed
// ordering, system scope) and returns what __hip_atomic_exchange yields (the
// prior value).
__DEVICE__ unsigned long long __ullAtomicExch_system(unsigned long long* address, unsigned long long val) {
  const unsigned long long previous =
      __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
}

// Atomic unsigned maximum of `val` and the unsigned long long at `address`
// (relaxed ordering, agent scope). When compiled for gfx941 the request goes
// through hip_cas_extrema_expander with a "current < candidate" predicate and
// the native builtin as callables; otherwise __hip_atomic_fetch_max is called
// directly.
__DEVICE__ unsigned long long __ullAtomicMax(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  auto should_replace = [](unsigned long long candidate, unsigned long long current) {
    return current < candidate;
  };
  auto native_max = [=]() {
    return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                  __HIP_MEMORY_SCOPE_AGENT);
  };
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, should_replace, native_max);
#else
  const unsigned long long previous =
      __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
#endif // __gfx941__
}
// __DEVICE__ unsigned long long __ullAtomicMax_block(unsigned long long *__p,
//                                                    unsigned long long __v) {
//   return __nvvm_atom_cta_max_gen_ull(__p, __v);
// }
// Atomic unsigned maximum of `val` and the unsigned long long at `address`
// (relaxed ordering, system scope). When compiled for gfx941 the request goes
// through hip_cas_extrema_expander with a "current < candidate" predicate and
// the native builtin as callables; otherwise __hip_atomic_fetch_max is called
// directly.
__DEVICE__ unsigned long long __ullAtomicMax_system(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  auto should_replace = [](unsigned long long candidate, unsigned long long current) {
    return current < candidate;
  };
  auto native_max = [=]() {
    return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                  __HIP_MEMORY_SCOPE_SYSTEM);
  };
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, should_replace, native_max);
#else
  const unsigned long long previous =
      __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
#endif // __gfx941__
}

// Atomic unsigned minimum of `val` and the unsigned long long at `address`
// (relaxed ordering, agent scope). When compiled for gfx941 the request goes
// through hip_cas_extrema_expander with a "candidate < current" predicate and
// the native builtin as callables; otherwise __hip_atomic_fetch_min is called
// directly.
__DEVICE__ unsigned long long __ullAtomicMin(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  auto should_replace = [](unsigned long long candidate, unsigned long long current) {
    return candidate < current;
  };
  auto native_min = [=]() {
    return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                  __HIP_MEMORY_SCOPE_AGENT);
  };
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, should_replace, native_min);
#else
  const unsigned long long previous =
      __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
#endif // __gfx941__
}
// __DEVICE__ unsigned long long __ullAtomicMin_block(unsigned long long *__p,
//                                                    unsigned long long __v) {
//   return __nvvm_atom_cta_min_gen_ull(__p, __v);
// }
// Atomic unsigned minimum of `val` and the unsigned long long at `address`
// (relaxed ordering, system scope). When compiled for gfx941 the request goes
// through hip_cas_extrema_expander with a "candidate < current" predicate and
// the native builtin as callables; otherwise __hip_atomic_fetch_min is called
// directly.
__DEVICE__ unsigned long long __ullAtomicMin_system(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  auto should_replace = [](unsigned long long candidate, unsigned long long current) {
    return candidate < current;
  };
  auto native_min = [=]() {
    return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                  __HIP_MEMORY_SCOPE_SYSTEM);
  };
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, should_replace, native_min);
#else
  const unsigned long long previous =
      __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
#endif // __gfx941__
}

// Atomic bitwise OR of `val` into the unsigned long long at `address`
// (relaxed ordering, agent scope). When compiled for gfx941 the request goes
// through hip_cas_expander, which receives the update rule and the native
// builtin as callables; otherwise __hip_atomic_fetch_or is called directly.
__DEVICE__ unsigned long long __ullAtomicOr(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  auto or_into = [](unsigned long long& word, unsigned long long bits) {
    word = word | bits;
  };
  auto native_or = [=]() {
    return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                 __HIP_MEMORY_SCOPE_AGENT);
  };
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, or_into, native_or);
#else
  const unsigned long long previous =
      __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
#endif // __gfx941__
}
// __DEVICE__ unsigned long long __ullAtomicOr_block(unsigned long long *__p,
//                                                   unsigned long long __v) {
//   return __nvvm_atom_cta_or_gen_ll((long long *)__p, __v);
// }
// Atomic bitwise OR of `val` into the unsigned long long at `address`
// (relaxed ordering, system scope). When compiled for gfx941 the request goes
// through hip_cas_expander, which receives the update rule and the native
// builtin as callables; otherwise __hip_atomic_fetch_or is called directly.
__DEVICE__ unsigned long long __ullAtomicOr_system(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  auto or_into = [](unsigned long long& word, unsigned long long bits) {
    word = word | bits;
  };
  auto native_or = [=]() {
    return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                 __HIP_MEMORY_SCOPE_SYSTEM);
  };
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, or_into, native_or);
#else
  const unsigned long long previous =
      __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
#endif // __gfx941__
}

// Atomic bitwise XOR of `val` into the unsigned long long at `address`
// (relaxed ordering, agent scope). When compiled for gfx941 the request goes
// through hip_cas_expander, which receives the update rule and the native
// builtin as callables; otherwise __hip_atomic_fetch_xor is called directly.
__DEVICE__ unsigned long long __ullAtomicXor(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  auto xor_into = [](unsigned long long& word, unsigned long long bits) {
    word = word ^ bits;
  };
  auto native_xor = [=]() {
    return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                  __HIP_MEMORY_SCOPE_AGENT);
  };
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, xor_into, native_xor);
#else
  const unsigned long long previous =
      __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
#endif // __gfx941__
}
// __DEVICE__ unsigned long long __ullAtomicXor_block(unsigned long long *__p,
//                                                    unsigned long long __v) {
//   return __nvvm_atom_cta_xor_gen_ll((long long *)__p, __v);
// }
// Atomic bitwise XOR of `val` into the unsigned long long at `address`
// (relaxed ordering, system scope).
//
// Every other and/or/xor wrapper in this file routes through hip_cas_expander
// when compiled for gfx941; this one previously called the builtin
// unconditionally. The gfx941 path is added here so the system-scope 64-bit
// XOR behaves like its siblings (same update rule as __ullAtomicXor, at
// system scope). On other targets the builtin is still called directly.
__DEVICE__ unsigned long long __ullAtomicXor_system(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
    address,
    val,
    [](unsigned long long& x, unsigned long long y) { x ^= y; },
    [=]() {
    return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                  __HIP_MEMORY_SCOPE_SYSTEM);
  });
#else
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

#pragma pop_macro("__DEVICE__")
#endif // __CLANG__CUDAMOCKER_ATOMIC_FUNCTIONS_H__
