/*===---- __clang_cuda_math.h - Device-side CUDA math support --------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 * Modifications Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
 * Notified per clause 4(b) of the license.
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __CLANG_CUDAMOCKER_MATH_H__
#define __CLANG_CUDAMOCKER_MATH_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif

#ifndef __OPENMP_NVPTX__
#if CUDA_VERSION < 9000
#error This file is intended to be used with CUDA-9+ only.
#endif
#endif

// __DEVICE__ is a helper macro with common set of attributes for the wrappers
// we implement in this file. We need static in order to avoid emitting unused
// functions and __forceinline__ helps inlining these wrappers at -O1.
// Any previous definition of __DEVICE__ is saved here and restored by the
// matching pop_macro at the end of this file.
#pragma push_macro("__DEVICE__")
#ifdef __OPENMP_NVPTX__
// OpenMP overlay: C++ builds get constexpr wrappers.
#if defined(__cplusplus)
#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
#else
// Use __BUILD_MATH_BUILTINS_LIB__ to build device specific libm-nvptx.bc
// for FORTRAN bitcode linking since FORTRAN cannot use c headers.
#ifdef __BUILD_MATH_BUILTINS_LIB__
// Note: this is the only path in this file that includes <limits.h>.
#include <limits.h>
#define HUGE_VALF (__builtin_huge_valf())
#define HUGE_VAL (__builtin_huge_val())
// Library build: the wrappers are emitted as weak external symbols.
#define __DEVICE__ extern __attribute__((always_inline, nothrow, cold, weak))
#else
#define __DEVICE__ static __attribute__((always_inline, nothrow))
#endif // __BUILD_MATH_BUILTINS_LIB__
#endif // __cplusplus
#else
// CUDA Clang
#define __DEVICE__ static __device__ __forceinline__
#endif

// Specialized version of __DEVICE__ for functions with void return type. Needed
// because the OpenMP overlay requires constexpr functions here but prior to
// c++14 void return functions could not be constexpr.
// Any previous definition is saved here and restored by the matching pop_macro
// at the end of this file.
#pragma push_macro("__DEVICE_VOID__")
#if defined(__OPENMP_NVPTX__) && defined(__cplusplus) && __cplusplus < 201402L
// Pre-C++14 OpenMP C++ build: same attributes as __DEVICE__ minus constexpr.
#define __DEVICE_VOID__ static __attribute__((always_inline, nothrow))
#else
// Everywhere else the void wrappers use exactly the __DEVICE__ attributes.
#define __DEVICE_VOID__ __DEVICE__
#endif

// libdevice provides fast low-precision and slow full-precision implementations
// for some functions. Which one gets selected depends on
// __CLANG_CUDA_APPROX_TRANSCENDENTALS__ which gets defined by clang if
// -ffast-math or -fcuda-approx-transcendentals are in effect.
// __FAST_OR_SLOW(fast, slow) expands to its first argument when that macro is
// defined and to its second argument otherwise.
// NOTE(review): no use of __FAST_OR_SLOW is visible in this file; the float
// wrappers that pick a fast variant test __FAST_MATH__ directly instead.
#pragma push_macro("__FAST_OR_SLOW")
#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
#define __FAST_OR_SLOW(fast, slow) fast
#else
#define __FAST_OR_SLOW(fast, slow) slow
#endif

// Absolute value of a signed int.
// Written as a plain conditional so that it needs neither CHAR_BIT (which is
// only available when <limits.h> happens to be included) nor the
// implementation-defined result of right-shifting a negative value.
// As with the C library abs(), the magnitude of the most negative int is not
// representable, so that input has no meaningful result.
__DEVICE__
int abs(int __x) {
  return (__x < 0) ? -__x : __x;
}
// Thin forwarding wrappers (fabs .. cos): each function passes its argument(s)
// through unchanged, in the same order, to the __ocml_* routine named in its
// body and returns that routine's result. The *_f64 routines back the double
// versions, the *_f32 routines back the float versions.
__DEVICE__ double fabs(double __x) {
  return __ocml_fabs_f64(__x);
}
__DEVICE__ double acos(double __x) {
  return __ocml_acos_f64(__x);
}
__DEVICE__ float acosf(float __x) {
  return __ocml_acos_f32(__x);
}
__DEVICE__ double acosh(double __x) {
  return __ocml_acosh_f64(__x);
}
__DEVICE__ float acoshf(float __x) {
  return __ocml_acosh_f32(__x);
}
__DEVICE__ double asin(double __x) {
  return __ocml_asin_f64(__x);
}
__DEVICE__ float asinf(float __x) {
  return __ocml_asin_f32(__x);
}
__DEVICE__ double asinh(double __x) {
  return __ocml_asinh_f64(__x);
}
__DEVICE__ float asinhf(float __x) {
  return __ocml_asinh_f32(__x);
}
__DEVICE__ double atan(double __x) {
  return __ocml_atan_f64(__x);
}
// Note: the first parameter is named __x here and is forwarded as the first
// argument of __ocml_atan2_*.
__DEVICE__ double atan2(double __x, double __y) {
  return __ocml_atan2_f64(__x, __y);
}
__DEVICE__ float atan2f(float __x, float __y) {
  return __ocml_atan2_f32(__x, __y);
}
__DEVICE__ float atanf(float __x) {
  return __ocml_atan_f32(__x);
}
__DEVICE__ double atanh(double __x) {
  return __ocml_atanh_f64(__x);
}
__DEVICE__ float atanhf(float __x) {
  return __ocml_atanh_f32(__x);
}
__DEVICE__ double cbrt(double __x) {
  return __ocml_cbrt_f64(__x);
}
__DEVICE__ float cbrtf(float __x) {
  return __ocml_cbrt_f32(__x);
}
__DEVICE__ double ceil(double __x) {
  return __ocml_ceil_f64(__x);
}
__DEVICE__ float ceilf(float __x) {
  return __ocml_ceil_f32(__x);
}
__DEVICE__ double copysign(double __x, double __y) {
  return __ocml_copysign_f64(__x, __y);
}
__DEVICE__ float copysignf(float __x, float __y) {
  return __ocml_copysign_f32(__x, __y);
}
__DEVICE__ double cos(double __x) {
  return __ocml_cos_f64(__x);
}
// Single-precision cosine wrapper.
// When __FAST_MATH__ is defined the call goes to __cosf; otherwise it goes to
// __ocml_cos_f32. The parameter uses a reserved (double-underscore) name, like
// the other wrappers in this header, so that a user macro called `x` cannot
// break the declaration.
__DEVICE__ float cosf(float __x) {
#ifdef __FAST_MATH__
  return __cosf(__x);
#else
  return __ocml_cos_f32(__x);
#endif
}
// Thin forwarding wrappers (cosh .. exp10): each function hands its argument
// to the __ocml_* routine named in its body and returns the result unchanged.
__DEVICE__ double cosh(double __x) {
  return __ocml_cosh_f64(__x);
}
__DEVICE__ float coshf(float __x) {
  return __ocml_cosh_f32(__x);
}
__DEVICE__ double cospi(double __x) {
  return __ocml_cospi_f64(__x);
}
__DEVICE__ float cospif(float __x) {
  return __ocml_cospi_f32(__x);
}
// The cyl_bessel_i0/i1 family maps onto the __ocml_i0_* / __ocml_i1_* routines.
__DEVICE__ double cyl_bessel_i0(double __x) {
  return __ocml_i0_f64(__x);
}
__DEVICE__ float cyl_bessel_i0f(float __x) {
  return __ocml_i0_f32(__x);
}
__DEVICE__ double cyl_bessel_i1(double __x) {
  return __ocml_i1_f64(__x);
}
__DEVICE__ float cyl_bessel_i1f(float __x) {
  return __ocml_i1_f32(__x);
}
__DEVICE__ double erf(double __x) {
  return __ocml_erf_f64(__x);
}
__DEVICE__ double erfc(double __x) {
  return __ocml_erfc_f64(__x);
}
__DEVICE__ float erfcf(float __x) {
  return __ocml_erfc_f32(__x);
}
__DEVICE__ double erfcinv(double __x) {
  return __ocml_erfcinv_f64(__x);
}
__DEVICE__ float erfcinvf(float __x) {
  return __ocml_erfcinv_f32(__x);
}
__DEVICE__ double erfcx(double __x) {
  return __ocml_erfcx_f64(__x);
}
__DEVICE__ float erfcxf(float __x) {
  return __ocml_erfcx_f32(__x);
}
__DEVICE__ float erff(float __x) {
  return __ocml_erf_f32(__x);
}
__DEVICE__ double erfinv(double __x) {
  return __ocml_erfinv_f64(__x);
}
__DEVICE__ float erfinvf(float __x) {
  return __ocml_erfinv_f32(__x);
}
__DEVICE__ double exp(double __x) {
  return __ocml_exp_f64(__x);
}
__DEVICE__ double exp10(double __x) {
  return __ocml_exp10_f64(__x);
}
// Single-precision exp10 wrapper.
// When __FAST_MATH__ is defined the call goes to __exp10f; otherwise it goes
// to __ocml_exp10_f32. The parameter uses a reserved (double-underscore) name,
// like the other wrappers in this header, so that a user macro called `x`
// cannot break the declaration.
__DEVICE__ float exp10f(float __x) {
#ifdef __FAST_MATH__
  return __exp10f(__x);
#else
  return __ocml_exp10_f32(__x);
#endif
}
// exp2 / exp2f forward their argument to __ocml_exp2_f64 / __ocml_exp2_f32.
__DEVICE__ double exp2(double __x) {
  return __ocml_exp2_f64(__x);
}
__DEVICE__ float exp2f(float __x) {
  return __ocml_exp2_f32(__x);
}
// Single-precision exp wrapper.
// When __FAST_MATH__ is defined the call goes to __expf; otherwise it goes to
// __ocml_exp_f32. The parameter uses a reserved (double-underscore) name, like
// the other wrappers in this header, so that a user macro called `x` cannot
// break the declaration.
__DEVICE__ float expf(float __x) {
#ifdef __FAST_MATH__
  return __expf(__x);
#else
  return __ocml_exp_f32(__x);
#endif
}
// Thin forwarding wrappers (expm1 .. fdimf): arguments are passed through
// unchanged to the __ocml_* routine named in each body.
__DEVICE__ double expm1(double __x) {
  return __ocml_expm1_f64(__x);
}
__DEVICE__ float expm1f(float __x) {
  return __ocml_expm1_f32(__x);
}
__DEVICE__ float fabsf(float __x) {
  return __ocml_fabs_f32(__x);
}
__DEVICE__ double fdim(double __x, double __y) {
  return __ocml_fdim_f64(__x, __y);
}
__DEVICE__ float fdimf(float __x, float __y) {
  return __ocml_fdim_f32(__x, __y);
}
// fdivide is an ordinary double-precision division of __a by __b.
__DEVICE__ double fdivide(double __a, double __b) {
  return __a / __b;
}
// fdividef is an ordinary single-precision division of __a by __b.
// No separate fast-math path exists: the division is the same whether or not
// __FAST_MATH__ / __CUDA_PREC_DIV are set, so no preprocessor selection is
// needed.
__DEVICE__ float fdividef(float __a, float __b) {
  return __a / __b;
}
// Thin forwarding wrappers (floor .. fmodf): arguments are passed through
// unchanged, in the same order, to the __ocml_* routine named in each body.
__DEVICE__ double floor(double __x) {
  return __ocml_floor_f64(__x);
}
__DEVICE__ float floorf(float __x) {
  return __ocml_floor_f32(__x);
}
__DEVICE__ double fma(double __x, double __y, double __z) {
  return __ocml_fma_f64(__x, __y, __z);
}
__DEVICE__ float fmaf(float __x, float __y, float __z) {
  return __ocml_fma_f32(__x, __y, __z);
}
__DEVICE__ double fmax(double __x, double __y) {
  return __ocml_fmax_f64(__x, __y);
}
__DEVICE__ float fmaxf(float __x, float __y) {
  return __ocml_fmax_f32(__x, __y);
}
__DEVICE__ double fmin(double __x, double __y) {
  return __ocml_fmin_f64(__x, __y);
}
__DEVICE__ float fminf(float __x, float __y) {
  return __ocml_fmin_f32(__x, __y);
}
__DEVICE__ double fmod(double __x, double __y) {
  return __ocml_fmod_f64(__x, __y);
}
__DEVICE__ float fmodf(float __x, float __y) {
  return __ocml_fmod_f32(__x, __y);
}
// frexp (double): calls __ocml_frexp_f64 and returns its result; the integer
// that routine writes through its pointer argument is stored to *__nptr.
// The routine is handed an address_space(5) pointer, so the integer is staged
// in a local variable and copied out afterwards. Under __OPENMP_AMDGCN__ that
// local is allocated with omp_thread_mem_alloc.
__DEVICE__ double frexp(double __x, int *__nptr) {
  int __exponent;
#ifdef __OPENMP_AMDGCN__
#pragma omp allocate(__exponent) allocator(omp_thread_mem_alloc)
#endif
  double __fraction = __ocml_frexp_f64(
      __x, (__attribute__((address_space(5))) int *)&__exponent);
  *__nptr = __exponent;
  return __fraction;
}

// frexpf (float): calls __ocml_frexp_f32 and returns its result; the integer
// that routine writes through its pointer argument is stored to *__nptr.
// The routine is handed an address_space(5) pointer, so the integer is staged
// in a local variable and copied out afterwards. Under __OPENMP_AMDGCN__ that
// local is allocated with omp_thread_mem_alloc.
__DEVICE__ float frexpf(float __x, int *__nptr) {
  int __exponent;
#ifdef __OPENMP_AMDGCN__
#pragma omp allocate(__exponent) allocator(omp_thread_mem_alloc)
#endif
  float __fraction = __ocml_frexp_f32(
      __x, (__attribute__((address_space(5))) int *)&__exponent);
  *__nptr = __exponent;
  return __fraction;
}

// Thin forwarding wrappers (hypot .. j1f): arguments are passed through
// unchanged to the __ocml_* routine named in each body.
__DEVICE__ double hypot(double __x, double __y) {
  return __ocml_hypot_f64(__x, __y);
}
__DEVICE__ float hypotf(float __x, float __y) {
  return __ocml_hypot_f32(__x, __y);
}
__DEVICE__ int ilogb(double __x) {
  return __ocml_ilogb_f64(__x);
}
__DEVICE__ int ilogbf(float __x) {
  return __ocml_ilogb_f32(__x);
}
__DEVICE__ double j0(double __x) {
  return __ocml_j0_f64(__x);
}
__DEVICE__ float j0f(float __x) {
  return __ocml_j0_f32(__x);
}
__DEVICE__ double j1(double __x) {
  return __ocml_j1_f64(__x);
}
__DEVICE__ float j1f(float __x) {
  return __ocml_j1_f32(__x);
}
// jn (double): order-__n value built from j0 and j1.
// Orders 0 and 1 are answered directly by j0()/j1(). For larger orders the
// value is produced by stepping the three-term recurrence
//     next = (2 * k) / __x * current - previous
// upwards from k = 1 while k < __n.
// NOTE(review): for __n < 0 the loop body never runs, so j1(__x) is returned.
// TODO: we could use Ahmes multiplication and the Miller & Brown algorithm
//       for linear recurrences to get O(log n) steps, but it's unclear if
//       it'd be beneficial in this case. Placeholder until OCML adds
//       support.
__DEVICE__ double jn(int __n, double __x) {
  if (__n == 0)
    return j0(__x);
  if (__n == 1)
    return j1(__x);

  double __prev = j0(__x);
  double __curr = j1(__x);
  int __k = 1;
  while (__k < __n) {
    double __next = (2 * __k) / __x * __curr - __prev;
    __prev = __curr;
    __curr = __next;
    ++__k;
  }
  return __curr;
}
// jnf (float): order-__n value built from j0f and j1f.
// Orders 0 and 1 are answered directly by j0f()/j1f(). For larger orders the
// value is produced by stepping the three-term recurrence
//     next = (2 * k) / __x * current - previous
// upwards from k = 1 while k < __n.
// NOTE(review): for __n < 0 the loop body never runs, so j1f(__x) is returned.
// TODO: we could use Ahmes multiplication and the Miller & Brown algorithm
//       for linear recurrences to get O(log n) steps, but it's unclear if
//       it'd be beneficial in this case.
__DEVICE__ float jnf(int __n, float __x) {
  if (__n == 0)
    return j0f(__x);
  if (__n == 1)
    return j1f(__x);

  float __prev = j0f(__x);
  float __curr = j1f(__x);
  int __k = 1;
  while (__k < __n) {
    float __next = (2 * __k) / __x * __curr - __prev;
    __prev = __curr;
    __curr = __next;
    ++__k;
  }
  return __curr;
}
// Absolute value of a signed long.
// A single definition serves every data model (no LP64 / WIN64 split is
// needed because the body does not depend on the width of long).
// Written as a plain conditional so that it needs neither CHAR_BIT (which is
// only available when <limits.h> happens to be included) nor the
// implementation-defined result of right-shifting a negative value.
// As with the C library labs(), the magnitude of the most negative long is
// not representable, so that input has no meaningful result.
__DEVICE__ long labs(long __x) {
  return (__x < 0) ? -__x : __x;
}
// Thin forwarding wrappers (ldexp .. lgammaf): arguments are passed through
// unchanged to the __ocml_* routine named in each body.
__DEVICE__ double ldexp(double __x, int __e) {
  return __ocml_ldexp_f64(__x, __e);
}
__DEVICE__ float ldexpf(float __x, int __e) {
  return __ocml_ldexp_f32(__x, __e);
}
__DEVICE__ double lgamma(double __x) {
  return __ocml_lgamma_f64(__x);
}
__DEVICE__ float lgammaf(float __x) {
  return __ocml_lgamma_f32(__x);
}
// Absolute value of a signed long long.
// Written as a plain conditional so that it needs neither CHAR_BIT (which is
// only available when <limits.h> happens to be included) nor the
// implementation-defined result of right-shifting a negative value.
// As with the C library llabs(), the magnitude of the most negative long long
// is not representable, so that input has no meaningful result.
__DEVICE__ long long llabs(long long __x) {
  return (__x < 0) ? -__x : __x;
}
// llmax / llmin: larger / smaller of two signed long long values. When the
// two compare equal the second argument is returned (same value either way).
// Parameters use reserved (double-underscore) names, like the rest of this
// header, so that user macros called `x` or `y` cannot break the declarations.
__DEVICE__ long long int llmax(long long int __x, long long int __y) {
  return __x > __y ? __x : __y;
}
__DEVICE__ long long int llmin(long long int __x, long long int __y) {
  return __x < __y ? __x : __y;
}
// llrint / llrintf / llround: call the floating-point __ocml_rint_* /
// __ocml_round_* routine and convert the result to long long int.
__DEVICE__ long long int llrint(double __x) {
  return (long long int)__ocml_rint_f64(__x);
}
__DEVICE__ long long int llrintf(float __x) {
  return (long long int)__ocml_rint_f32(__x);
}
__DEVICE__ long long int llround(double __x) {
  return (long long int)__ocml_round_f64(__x);
}
// round / roundf / log / log10: plain forwarding to the named __ocml_* routine.
__DEVICE__ double round(double __x) {
  return __ocml_round_f64(__x);
}
__DEVICE__ float roundf(float __x) {
  return __ocml_round_f32(__x);
}
__DEVICE__ double log(double __x) {
  return __ocml_log_f64(__x);
}
__DEVICE__ double log10(double __x) {
  return __ocml_log10_f64(__x);
}
// Single-precision log10 wrapper.
// When __FAST_MATH__ is defined the call goes to __log10f; otherwise it goes
// to __ocml_log10_f32. The parameter uses a reserved (double-underscore) name,
// like the other wrappers in this header, so that a user macro called `x`
// cannot break the declaration.
__DEVICE__ float log10f(float __x) {
#ifdef __FAST_MATH__
  return __log10f(__x);
#else
  return __ocml_log10_f32(__x);
#endif
}
// log1p / log1pf / log2 forward their argument to the __ocml_* routine named
// in each body.
__DEVICE__ double log1p(double __x) {
  return __ocml_log1p_f64(__x);
}
__DEVICE__ float log1pf(float __x) {
  return __ocml_log1p_f32(__x);
}
__DEVICE__ double log2(double __x) {
  return __ocml_log2_f64(__x);
}
// Single-precision log2 wrapper.
// When __FAST_MATH__ is defined the call goes to __log2f; otherwise it goes to
// __ocml_log2_f32. The parameter uses a reserved (double-underscore) name,
// like the other wrappers in this header, so that a user macro called `x`
// cannot break the declaration.
__DEVICE__ float log2f(float __x) {
#ifdef __FAST_MATH__
  return __log2f(__x);
#else
  return __ocml_log2_f32(__x);
#endif
}
// logb / logbf forward their argument to __ocml_logb_f64 / __ocml_logb_f32.
__DEVICE__ double logb(double __x) {
  return __ocml_logb_f64(__x);
}
__DEVICE__ float logbf(float __x) {
  return __ocml_logb_f32(__x);
}
// Single-precision natural-log wrapper.
// When __FAST_MATH__ is defined the call goes to __logf; otherwise it goes to
// __ocml_log_f32. The parameter uses a reserved (double-underscore) name, like
// the other wrappers in this header, so that a user macro called `x` cannot
// break the declaration.
__DEVICE__ float logf(float __x) {
#ifdef __FAST_MATH__
  return __logf(__x);
#else
  return __ocml_log_f32(__x);
#endif
}

// lrint / lrintf / lround / lroundf / llroundf: call the floating-point
// __ocml_rint_* / __ocml_round_* routine and convert the result to the integer
// return type. A single set of definitions serves every data model; the bodies
// do not depend on the width of long, so no LP64 / WIN64 split is needed.
// llroundf's parameter uses a reserved (double-underscore) name, like the
// other wrappers, so that a user macro called `x` cannot break the declaration.
__DEVICE__ long int lrint(double __x) { return __ocml_rint_f64(__x); }
__DEVICE__ long int lrintf(float __x) { return __ocml_rint_f32(__x); }
__DEVICE__ long int lround(double __x) { return __ocml_round_f64(__x); }
__DEVICE__ long int lroundf(float __x) { return __ocml_round_f32(__x); }
__DEVICE__ long long int llroundf(float __x) { return __ocml_round_f32(__x); }
// min: the smaller of two ints; when they are equal __arg2 is returned.
__DEVICE__ int min(int __arg1, int __arg2) {
  return (__arg2 > __arg1) ? __arg1 : __arg2;
}
// max: the larger of two ints; when they are equal __arg2 is returned.
__DEVICE__ int max(int __arg1, int __arg2) {
  return (__arg2 < __arg1) ? __arg1 : __arg2;
}

// modf (double): calls __ocml_modf_f64 and returns its result; the value that
// routine writes through its pointer argument is stored to *__iptr.
// The routine is handed an address_space(5) pointer, so that value is staged
// in a local variable and copied out afterwards. Under __OPENMP_AMDGCN__ the
// local is allocated with omp_thread_mem_alloc.
__DEVICE__ double modf(double __x, double *__iptr) {
  double __whole;
#ifdef __OPENMP_AMDGCN__
#pragma omp allocate(__whole) allocator(omp_thread_mem_alloc)
#endif
  double __frac = __ocml_modf_f64(
      __x, (__attribute__((address_space(5))) double *)&__whole);
  *__iptr = __whole;
  return __frac;
}
// modff (float): calls __ocml_modf_f32 and returns its result; the value that
// routine writes through its pointer argument is stored to *__iptr.
// The routine is handed an address_space(5) pointer, so that value is staged
// in a local variable and copied out afterwards. Under __OPENMP_AMDGCN__ the
// local is allocated with omp_thread_mem_alloc.
__DEVICE__ float modff(float __x, float *__iptr) {
  float __whole;
#ifdef __OPENMP_AMDGCN__
#pragma omp allocate(__whole) allocator(omp_thread_mem_alloc)
#endif
  float __frac = __ocml_modf_f32(
      __x, (__attribute__((address_space(5))) float *)&__whole);
  *__iptr = __whole;
  return __frac;
}
// Thin forwarding wrappers (nearbyint .. nextafterf): arguments are passed
// through unchanged, in the same order, to the __ocml_* routine named in each
// body.
__DEVICE__ double nearbyint(double __x) {
  return __ocml_nearbyint_f64(__x);
}
__DEVICE__ float nearbyintf(float __x) {
  return __ocml_nearbyint_f32(__x);
}
__DEVICE__
double nextafter(double __x, double __y) {
  return __ocml_nextafter_f64(__x, __y);
}
__DEVICE__
float nextafterf(float __x, float __y) {
  return __ocml_nextafter_f32(__x, __y);
}
// norm (double): square root of the sum of squares of the first __dim
// elements of __a, i.e. __ocml_sqrt_f64(a[0]^2 + ... + a[__dim-1]^2).
// A __dim of zero or less reads nothing and yields __ocml_sqrt_f64(0). (A
// count-down `while (__dim--)` loop would treat a negative __dim as true and
// walk far beyond the array.)
// TODO: placeholder until OCML adds support.
__DEVICE__ double norm(int __dim, const double *__a) {
  double __r = 0;
  for (int __i = 0; __i < __dim; ++__i)
    __r += __a[__i] * __a[__i];

  return __ocml_sqrt_f64(__r);
}
// norm3d / norm4d family: forwards the 3 or 4 components, in order, to the
// __ocml_len3_* / __ocml_len4_* routines.
__DEVICE__
double norm3d(double __x, double __y, double __z) {
  return __ocml_len3_f64(__x, __y, __z);
}
__DEVICE__
float norm3df(float __x, float __y, float __z) {
  return __ocml_len3_f32(__x, __y, __z);
}
__DEVICE__
double norm4d(double __x, double __y, double __z, double __w) {
  return __ocml_len4_f64(__x, __y, __z, __w);
}
__DEVICE__
float norm4df(float __x, float __y, float __z, float __w) {
  return __ocml_len4_f32(__x, __y, __z, __w);
}
// normcdf / normcdfinv family: forwards to __ocml_ncdf_* / __ocml_ncdfinv_*.
__DEVICE__ double normcdf(double __x) {
  return __ocml_ncdf_f64(__x);
}
__DEVICE__ float normcdff(float __x) {
  return __ocml_ncdf_f32(__x);
}
__DEVICE__ double normcdfinv(double __x) {
  return __ocml_ncdfinv_f64(__x);
}
__DEVICE__ float normcdfinvf(float __x) {
  return __ocml_ncdfinv_f32(__x);
}
// normf (float): square root of the sum of squares of the first __dim
// elements of __a, i.e. __ocml_sqrt_f32(a[0]^2 + ... + a[__dim-1]^2).
// A __dim of zero or less reads nothing and yields __ocml_sqrt_f32(0). (A
// count-down `while (__dim--)` loop would treat a negative __dim as true and
// walk far beyond the array.)
// TODO: placeholder until OCML adds support.
__DEVICE__ float normf(int __dim, const float *__a) {
  float __r = 0;
  for (int __i = 0; __i < __dim; ++__i)
    __r += __a[__i] * __a[__i];

  return __ocml_sqrt_f32(__r);
}
// Thin forwarding wrappers (pow .. remainderf): arguments are passed through
// unchanged, in the same order, to the __ocml_* routine named in each body.
__DEVICE__ double pow(double __x, double __y) {
  return __ocml_pow_f64(__x, __y);
}
__DEVICE__ float powf(float __x, float __y) {
  return __ocml_pow_f32(__x, __y);
}
// powi / powif take an int exponent and map onto the __ocml_pown_* routines.
__DEVICE__ double powi(double __x, int __y) {
  return __ocml_pown_f64(__x, __y);
}
__DEVICE__ float powif(float __x, int __y) {
  return __ocml_pown_f32(__x, __y);
}
__DEVICE__ double rcbrt(double __x) {
  return __ocml_rcbrt_f64(__x);
}
__DEVICE__ float rcbrtf(float __x) {
  return __ocml_rcbrt_f32(__x);
}
__DEVICE__
double remainder(double __x, double __y) {
  return __ocml_remainder_f64(__x, __y);
}
__DEVICE__
float remainderf(float __x, float __y) {
  return __ocml_remainder_f32(__x, __y);
}
// remquo (double): calls __ocml_remquo_f64 and returns its result; the integer
// that routine writes through its pointer argument is stored to *__quo.
// The routine is handed an address_space(5) pointer, so the integer is staged
// in a local variable and copied out afterwards. Under __OPENMP_AMDGCN__ the
// local is allocated with omp_thread_mem_alloc.
__DEVICE__ double remquo(double __x, double __y, int *__quo) {
  int __quotient;
#ifdef __OPENMP_AMDGCN__
#pragma omp allocate(__quotient) allocator(omp_thread_mem_alloc)
#endif
  double __rem = __ocml_remquo_f64(
      __x, __y, (__attribute__((address_space(5))) int *)&__quotient);
  *__quo = __quotient;
  return __rem;
}
// remquof (float): calls __ocml_remquo_f32 and returns its result; the integer
// that routine writes through its pointer argument is stored to *__quo.
// The routine is handed an address_space(5) pointer, so the integer is staged
// in a local variable and copied out afterwards. Under __OPENMP_AMDGCN__ the
// local is allocated with omp_thread_mem_alloc.
__DEVICE__ float remquof(float __x, float __y, int *__quo) {
  int __quotient;
#ifdef __OPENMP_AMDGCN__
#pragma omp allocate(__quotient) allocator(omp_thread_mem_alloc)
#endif
  float __rem = __ocml_remquo_f32(
      __x, __y, (__attribute__((address_space(5))) int *)&__quotient);
  *__quo = __quotient;
  return __rem;
}
// rhypot / rhypotf forward both arguments to __ocml_rhypot_f64 / _f32.
__DEVICE__ double rhypot(double __x, double __y) {
  return __ocml_rhypot_f64(__x, __y);
}
__DEVICE__ float rhypotf(float __x, float __y) {
  return __ocml_rhypot_f32(__x, __y);
}
// __nv_rint* in libdevice is buggy and produces incorrect results.
// NOTE(review): the wrappers below call __ocml_rint_*, not __nv_rint*, so the
// remark above may be stale -- confirm and drop if so.
__DEVICE__ double rint(double __x) {
  return __ocml_rint_f64(__x);
}
__DEVICE__ float rintf(float __x) {
  return __ocml_rint_f32(__x);
}
// rnorm (double): __ocml_rsqrt_f64 applied to the sum of squares of the first
// __dim elements of __a.
// A __dim of zero or less reads nothing and yields __ocml_rsqrt_f64(0). (A
// count-down `while (__dim--)` loop would treat a negative __dim as true and
// walk far beyond the array.)
// TODO: placeholder until OCML adds support.
__DEVICE__ double rnorm(int __dim, const double *__a) {
  double __r = 0;
  for (int __i = 0; __i < __dim; ++__i)
    __r += __a[__i] * __a[__i];

  return __ocml_rsqrt_f64(__r);
}
// rnorm3d / rnorm4d family: forwards the 3 or 4 components, in order, to the
// __ocml_rlen3_* / __ocml_rlen4_* routines.
__DEVICE__
double rnorm3d(double __x, double __y, double __z) {
  return __ocml_rlen3_f64(__x, __y, __z);
}
__DEVICE__
float rnorm3df(float __x, float __y, float __z) {
  return __ocml_rlen3_f32(__x, __y, __z);
}
__DEVICE__
double rnorm4d(double __x, double __y, double __z, double __w) {
  return __ocml_rlen4_f64(__x, __y, __z, __w);
}
__DEVICE__
float rnorm4df(float __x, float __y, float __z, float __w) {
  return __ocml_rlen4_f32(__x, __y, __z, __w);
}
// rnormf (float): __ocml_rsqrt_f32 applied to the sum of squares of the first
// __dim elements of __a.
// A __dim of zero or less reads nothing and yields __ocml_rsqrt_f32(0). (A
// count-down `while (__dim--)` loop would treat a negative __dim as true and
// walk far beyond the array.)
// TODO: placeholder until OCML adds support.
__DEVICE__ float rnormf(int __dim, const float *__a) {
  float __r = 0;
  for (int __i = 0; __i < __dim; ++__i)
    __r += __a[__i] * __a[__i];

  return __ocml_rsqrt_f32(__r);
}
// Thin forwarding wrappers (rsqrt .. scalbnf): arguments are passed through
// unchanged to the __ocml_* routine named in each body.
__DEVICE__ double rsqrt(double __x) {
  return __ocml_rsqrt_f64(__x);
}
__DEVICE__ float rsqrtf(float __x) {
  return __ocml_rsqrt_f32(__x);
}
__DEVICE__ double scalbn(double __x, int __n) {
  return __ocml_scalbn_f64(__x, __n);
}
__DEVICE__ float scalbnf(float __x, int __n) {
  return __ocml_scalbn_f32(__x, __n);
}
// scalbln (double): scalbn with a long exponent.
//  * __n >= INT_MAX : handed to __ocml_scalb_f64, as before.
//  * otherwise      : handed to __ocml_scalbn_f64 after clamping to the lowest
//                     int value. Without the clamp a long exponent below
//                     INT_MIN would be truncated on conversion to int (e.g.
//                     -2^32 would become 0 and return __x unchanged).
// NOTE(review): assumes __ocml_scalbn_f64 takes an int exponent, as the
// scalbn() wrapper's signature suggests -- confirm against the OCML header.
// The compiler-provided __INT_MAX__ is used so that no <limits.h> include is
// required. Kept as a single return expression so the function stays valid
// where __DEVICE__ expands to constexpr.
__DEVICE__ double scalbln(double __x, long int __n) {
  return (__n >= __INT_MAX__)
             ? __ocml_scalb_f64(__x, __n)
             : __ocml_scalbn_f64(__x, (__n < (-__INT_MAX__ - 1))
                                          ? (-__INT_MAX__ - 1)
                                          : (int)__n);
}
// scalblnf (float): scalbnf with a long exponent.
//  * __n >= INT_MAX : handed to __ocml_scalb_f32, as before.
//  * otherwise      : handed to __ocml_scalbn_f32 after clamping to the lowest
//                     int value. Without the clamp a long exponent below
//                     INT_MIN would be truncated on conversion to int (e.g.
//                     -2^32 would become 0 and return __x unchanged).
// NOTE(review): assumes __ocml_scalbn_f32 takes an int exponent, as the
// scalbnf() wrapper's signature suggests -- confirm against the OCML header.
// The compiler-provided __INT_MAX__ is used so that no <limits.h> include is
// required. Kept as a single return expression so the function stays valid
// where __DEVICE__ expands to constexpr.
__DEVICE__ float scalblnf(float __x, long int __n) {
  return (__n >= __INT_MAX__)
             ? __ocml_scalb_f32(__x, __n)
             : __ocml_scalbn_f32(__x, (__n < (-__INT_MAX__ - 1))
                                          ? (-__INT_MAX__ - 1)
                                          : (int)__n);
}
// sin (double) forwards its argument to __ocml_sin_f64.
__DEVICE__ double sin(double __x) {
  return __ocml_sin_f64(__x);
}
// sincos (double): one call to __ocml_sincos_f64 produces both outputs. Its
// return value is stored to *__sinptr; the value it writes through its pointer
// argument is stored to *__cosptr (written second).
// The routine is handed an address_space(5) pointer, so that second value is
// staged in a local variable first. Under __OPENMP_AMDGCN__ the local is
// allocated with omp_thread_mem_alloc.
__DEVICE_VOID__ void sincos(double __x, double *__sinptr, double *__cosptr) {
  double __cosval;
#ifdef __OPENMP_AMDGCN__
#pragma omp allocate(__cosval) allocator(omp_thread_mem_alloc)
#endif
  *__sinptr = __ocml_sincos_f64(
      __x, (__attribute__((address_space(5))) double *)&__cosval);
  *__cosptr = __cosval;
}
// sincosf (float).
// With __FAST_MATH__ defined, the three arguments are handed straight to
// __sincosf. Otherwise one call to __ocml_sincos_f32 produces both outputs:
// its return value is stored to *__sinptr and the value it writes through its
// address_space(5) pointer argument is staged in a local and then stored to
// *__cosptr. Under __OPENMP_AMDGCN__ that local is allocated with
// omp_thread_mem_alloc.
__DEVICE_VOID__ void sincosf(float __x, float *__sinptr, float *__cosptr) {
#ifdef __FAST_MATH__
  __sincosf(__x, __sinptr, __cosptr);
#else
  float __cosval;
#ifdef __OPENMP_AMDGCN__
#pragma omp allocate(__cosval) allocator(omp_thread_mem_alloc)
#endif
  *__sinptr = __ocml_sincos_f32(
      __x, (__attribute__((address_space(5))) float *)&__cosval);
  *__cosptr = __cosval;
#endif
}
// sincospi (double): one call to __ocml_sincospi_f64 produces both outputs.
// Its return value is stored to *__sinptr; the value it writes through its
// pointer argument is stored to *__cosptr (written second).
// The routine is handed an address_space(5) pointer, so that second value is
// staged in a local variable first. Under __OPENMP_AMDGCN__ the local is
// allocated with omp_thread_mem_alloc.
__DEVICE_VOID__ void sincospi(double __x, double *__sinptr, double *__cosptr) {
  double __cosval;
#ifdef __OPENMP_AMDGCN__
#pragma omp allocate(__cosval) allocator(omp_thread_mem_alloc)
#endif
  *__sinptr = __ocml_sincospi_f64(
      __x, (__attribute__((address_space(5))) double *)&__cosval);
  *__cosptr = __cosval;
}
// sincospif (float): one call to __ocml_sincospi_f32 produces both outputs.
// Its return value is stored to *__sinptr; the value it writes through its
// pointer argument is stored to *__cosptr (written second).
// The routine is handed an address_space(5) pointer, so that second value is
// staged in a local variable first. Under __OPENMP_AMDGCN__ the local is
// allocated with omp_thread_mem_alloc.
__DEVICE_VOID__ void sincospif(float __x, float *__sinptr, float *__cosptr) {
  float __cosval;
#ifdef __OPENMP_AMDGCN__
#pragma omp allocate(__cosval) allocator(omp_thread_mem_alloc)
#endif
  *__sinptr = __ocml_sincospi_f32(
      __x, (__attribute__((address_space(5))) float *)&__cosval);
  *__cosptr = __cosval;
}
// Single-precision sine wrapper.
// When __FAST_MATH__ is defined the call goes to __sinf; otherwise it goes to
// __ocml_sin_f32. The parameter uses a reserved (double-underscore) name, like
// the other wrappers in this header, so that a user macro called `x` cannot
// break the declaration.
__DEVICE__ float sinf(float __x) {
#ifdef __FAST_MATH__
  return __sinf(__x);
#else
  return __ocml_sin_f32(__x);
#endif
}
// Thin forwarding wrappers (sinh .. truncf): each function hands its argument
// to the __ocml_* routine named in its body and returns the result unchanged.
__DEVICE__ double sinh(double __x) {
  return __ocml_sinh_f64(__x);
}
__DEVICE__ float sinhf(float __x) {
  return __ocml_sinh_f32(__x);
}
__DEVICE__ double sinpi(double __x) {
  return __ocml_sinpi_f64(__x);
}
__DEVICE__ float sinpif(float __x) {
  return __ocml_sinpi_f32(__x);
}
__DEVICE__ double sqrt(double __x) {
  return __ocml_sqrt_f64(__x);
}
__DEVICE__ float sqrtf(float __x) {
  return __ocml_sqrt_f32(__x);
}
__DEVICE__ double tan(double __x) {
  return __ocml_tan_f64(__x);
}
__DEVICE__ float tanf(float __x) {
  return __ocml_tan_f32(__x);
}
__DEVICE__ double tanh(double __x) {
  return __ocml_tanh_f64(__x);
}
__DEVICE__ float tanhf(float __x) {
  return __ocml_tanh_f32(__x);
}
__DEVICE__ double tgamma(double __x) {
  return __ocml_tgamma_f64(__x);
}
__DEVICE__ float tgammaf(float __x) {
  return __ocml_tgamma_f32(__x);
}
__DEVICE__ double trunc(double __x) {
  return __ocml_trunc_f64(__x);
}
__DEVICE__ float truncf(float __x) {
  return __ocml_trunc_f32(__x);
}
// ullmax: larger of two unsigned long long values.
// The return type is unsigned long long int, matching the operands and the
// sibling ullmin; returning a signed long long would turn any result above
// LLONG_MAX into a negative number.
// Parameters use reserved (double-underscore) names, like the rest of this
// header, so that user macros called `x` or `y` cannot break the declaration.
__DEVICE__ unsigned long long int ullmax(unsigned long long int __x,
                                         unsigned long long int __y) {
  return __x > __y ? __x : __y;
}
// ullmin / umax / umin: smaller or larger of two unsigned values, decided with
// a plain comparison. When the two compare equal the second argument is
// returned (same value either way).
// Parameters use reserved (double-underscore) names, like the rest of this
// header, so that user macros called `x` or `y` cannot break the declarations.
__DEVICE__ unsigned long long int ullmin(unsigned long long int __x,
                                         unsigned long long int __y) {
  return __x < __y ? __x : __y;
}
__DEVICE__ unsigned int umax(unsigned int __x, unsigned int __y) {
  return __x > __y ? __x : __y;
}
__DEVICE__ unsigned int umin(unsigned int __x, unsigned int __y) {
  return __x < __y ? __x : __y;
}

// y0 / y1 family: forwards the argument to __ocml_y0_* / __ocml_y1_*.
__DEVICE__ double y0(double __x) {
  return __ocml_y0_f64(__x);
}
__DEVICE__ float y0f(float __x) {
  return __ocml_y0_f32(__x);
}
__DEVICE__ double y1(double __x) {
  return __ocml_y1_f64(__x);
}
__DEVICE__ float y1f(float __x) {
  return __ocml_y1_f32(__x);
}
// yn (double): order-__n value built from y0 and y1.
// Orders 0 and 1 are answered directly by y0()/y1(). For larger orders the
// value is produced by stepping the three-term recurrence
//     next = (2 * k) / __x * current - previous
// upwards from k = 1 while k < __n.
// NOTE(review): for __n < 0 the loop body never runs, so y1(__x) is returned.
// TODO: we could use Ahmes multiplication and the Miller & Brown algorithm
//       for linear recurrences to get O(log n) steps, but it's unclear if
//       it'd be beneficial in this case. Placeholder until OCML adds
//       support.
__DEVICE__ double yn(int __n, double __x) {
  if (__n == 0)
    return y0(__x);
  if (__n == 1)
    return y1(__x);

  double __prev = y0(__x);
  double __curr = y1(__x);
  int __k = 1;
  while (__k < __n) {
    double __next = (2 * __k) / __x * __curr - __prev;
    __prev = __curr;
    __curr = __next;
    ++__k;
  }
  return __curr;
}

// ynf (float): order-__n value built from y0f and y1f.
// Orders 0 and 1 are answered directly by y0f()/y1f(). For larger orders the
// value is produced by stepping the three-term recurrence
//     next = (2 * k) / __x * current - previous
// upwards from k = 1 while k < __n.
// NOTE(review): for __n < 0 the loop body never runs, so y1f(__x) is returned.
// TODO: we could use Ahmes multiplication and the Miller & Brown algorithm
//       for linear recurrences to get O(log n) steps, but it's unclear if
//       it'd be beneficial in this case. Placeholder until OCML adds
//       support.
__DEVICE__ float ynf(int __n, float __x) {
  if (__n == 0)
    return y0f(__x);
  if (__n == 1)
    return y1f(__x);

  float __prev = y0f(__x);
  float __curr = y1f(__x);
  int __k = 1;
  while (__k < __n) {
    float __next = (2 * __k) / __x * __curr - __prev;
    __prev = __curr;
    __curr = __next;
    ++__k;
  }
  return __curr;
}

#pragma pop_macro("__DEVICE__")
#pragma pop_macro("__DEVICE_VOID__")
#pragma pop_macro("__FAST_OR_SLOW")

#endif // __CLANG_CUDAMOCKER_MATH_H__
