/**#
   *  @file: cuda_runtime_api.h
   *  @brief: cuda runtime API header wrapper
   *  @author: louis
   *  @date: 2020/4/17
   #**/

#if !defined(__CUDA_RUNTIME_API_H__)
#define __CUDA_RUNTIME_API_H__

/* Runtime version number this wrapper advertises to code that tests the
 * CUDA-style CUDART_VERSION macro.
 * NOTE(review): 10000 presumably follows CUDA's major*1000 + minor*10 encoding
 * (i.e. 10.0) -- confirm against the feature set actually declared below. */
#define CUDART_VERSION  10000

#include <limits.h>

#include "hip/hip_runtime.h"
#include "hip/hip_cuda_type.h"

/** \cond impl_private */
/*
 * __dv(v): "default value" helper used in the prototypes below.
 *
 * In C++ it expands to "= v", giving the parameter a default argument; in C it
 * expands to nothing, so C callers must always pass every argument explicitly.
 * The macro is only defined here when the includer has not already defined it.
 */
#if !defined(__dv)

#if defined(__cplusplus)

#define __dv(v) \
        = v

#else /* __cplusplus */

#define __dv(v)

#endif /* __cplusplus */

#endif /* !__dv */
/** \endcond impl_private */

/* Alias for the __device__ qualifier.  No declaration visible in this header
 * uses it; it is presumably kept for source compatibility with code written
 * against the CUDA header -- TODO confirm. */
#define CUDART_DEVICE __device__

/** \cond impl_private */
/*
 * __CUDA_DEPRECATED: compiler-specific "deprecated" marker for declarations.
 *
 *  - empty when generating Doxygen output (__DOXYGEN_ONLY__) or when the user
 *    opts out of the warnings by defining CUDA_ENABLE_DEPRECATED;
 *  - __declspec(deprecated) for MSVC;
 *  - __attribute__((deprecated)) for GNU-compatible compilers;
 *  - empty for any other compiler.
 *
 * Within this header only commented-out prototypes reference the macro, and it
 * is #undef'd again at the end of the file.
 */
#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
#define __CUDA_DEPRECATED
#elif defined(_MSC_VER)
#define __CUDA_DEPRECATED __declspec(deprecated)
#elif defined(__GNUC__)
#define __CUDA_DEPRECATED __attribute__((deprecated))
#else
#define __CUDA_DEPRECATED
#endif
/** \endcond impl_private */

/* Calling convention applied to user callbacks (see cudaStreamCallback_t):
 * __stdcall when _WIN32 is defined, the platform default everywhere else. */
#ifdef _WIN32
#define CUDART_CB __stdcall
#else
#define CUDART_CB
#endif

#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */

/*
 * Device-wide control and configuration.
 *
 * All entry points in this group are host-only (__host__) and report their
 * outcome as a cudaError_t.  Only prototypes are given here; the definitions
 * live outside this header.
 * NOTE(review): names and signatures mirror the CUDA runtime API, so semantics
 * are presumably the same -- confirm against the backing implementation.
 */
extern __host__ cudaError_t  cudaDeviceReset(void);

extern __host__  cudaError_t  cudaDeviceSynchronize(void);

/* Note: this header declares the limit getter only; no cudaDeviceSetLimit
 * prototype is present. */
extern __host__  cudaError_t  cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit);

/* Cache-configuration getter/setter pair (enum cudaFuncCache). */
extern __host__  cudaError_t  cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);

extern __host__ cudaError_t  cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig);

/* Both priority bounds are written through the two int pointers. */
extern __host__  cudaError_t  cudaDeviceGetStreamPriorityRange(int *leastPriority, int *greatestPriority);

/* Shared-memory configuration getter/setter pair (enum cudaSharedMemConfig). */
extern __host__  cudaError_t  cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);

extern __host__ cudaError_t  cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config);

/* PCI bus id <-> device ordinal conversion.  pciBusId is a C string; for the
 * reverse lookup the caller supplies the buffer together with its size (len). */
extern __host__ cudaError_t  cudaDeviceGetByPCIBusId(int *device, const char *pciBusId);

extern __host__ cudaError_t  cudaDeviceGetPCIBusId(char *pciBusId, int len, int device);

/*
 * Inter-process communication (IPC) handles for events and device memory.
 *
 * The "Get" functions fill a caller-provided handle structure; the "Open"
 * functions take a handle by value and return an event / device pointer through
 * their first argument.
 * NOTE(review): presumably handles are meant to be passed between processes as
 * in CUDA -- confirm what the implementation supports.
 */
extern __host__ cudaError_t  cudaIpcGetEventHandle(cudaIpcEventHandle_t *handle, cudaEvent_t event);

extern __host__ cudaError_t  cudaIpcOpenEventHandle(cudaEvent_t *event, cudaIpcEventHandle_t handle);

extern __host__ cudaError_t  cudaIpcGetMemHandle(cudaIpcMemHandle_t *handle, void *devPtr);

extern __host__ cudaError_t  cudaIpcOpenMemHandle(void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags);

/* Counterpart of cudaIpcOpenMemHandle: takes the device pointer it produced. */
extern __host__ cudaError_t  cudaIpcCloseMemHandle(void *devPtr);

/*
extern __CUDA_DEPRECATED __host__ cudaError_t  cudaThreadExit(void);

extern __CUDA_DEPRECATED __host__ cudaError_t  cudaThreadSynchronize(void);

extern __CUDA_DEPRECATED __host__ cudaError_t  cudaThreadSetLimit(enum cudaLimit limit, size_t value);

extern __CUDA_DEPRECATED __host__ cudaError_t  cudaThreadGetLimit(size_t *pValue, enum cudaLimit limit);

extern __CUDA_DEPRECATED __host__ cudaError_t  cudaThreadGetCacheConfig(enum cudaFuncCache *pCacheConfig);

extern __CUDA_DEPRECATED __host__ cudaError_t  cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig);
*/
/*
 * Error query helpers.
 *
 * NOTE(review): in the CUDA runtime cudaGetLastError() returns the last error
 * and clears it, while cudaPeekAtLastError() returns it without clearing;
 * presumably this wrapper keeps that distinction -- confirm.
 */
extern __host__  cudaError_t  cudaGetLastError(void);

extern __host__  cudaError_t  cudaPeekAtLastError(void);

/**
 * \brief Returns the string representation of an error code enum name
 *
 * Returns a string containing the name of an error code in the enum.  If the error
 * code is not recognized, "unrecognized error code" is returned.
 *
 * \param error - Error code to convert to string
 *
 * \return
 * \p char* pointer to a NULL-terminated string
 *
 * \sa ::cudaGetErrorString, ::cudaGetLastError, ::cudaPeekAtLastError, ::cudaError,
 * ::cuGetErrorName
 */
extern __host__  const char*  cudaGetErrorName(cudaError_t error);

/* Same shape as cudaGetErrorName: maps an error code to a const C string.
 * NOTE(review): presumably a human-readable description rather than the enum
 * name -- confirm. */
extern __host__  const char*  cudaGetErrorString(cudaError_t error);

/*
 * Device enumeration, property queries and current-device selection.
 *
 * Devices are identified by an int ordinal; query results are written through
 * the leading pointer argument.  Prototypes that are commented out below are
 * not offered by this wrapper.
 */
extern __host__  cudaError_t  cudaGetDeviceCount(int *count);

extern __host__  cudaError_t  cudaGetDeviceProperties(cudaDeviceProp *prop, int device);

/* Single-attribute query: one int value selected by enum cudaDeviceAttr. */
extern __host__  cudaError_t  cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);

//extern __host__  cudaError_t  cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr, int srcDevice, int dstDevice);

/* Takes a read-only property description and yields a device ordinal. */
extern __host__ cudaError_t  cudaChooseDevice(int *device, const cudaDeviceProp *prop);

extern __host__ cudaError_t  cudaSetDevice(int device);

extern __host__  cudaError_t  cudaGetDevice(int *device);

//extern __host__ cudaError_t  cudaSetValidDevices(int *device_arr, int len);

/* Device flag setter/getter pair (unsigned int bit mask). */
extern __host__ cudaError_t  cudaSetDeviceFlags( unsigned int flags );

extern __host__ cudaError_t  cudaGetDeviceFlags( unsigned int *flags );

/*
 * Stream management.
 *
 * The creation functions return the new stream through pStream; all other
 * functions take an existing cudaStream_t by value.  Every function is
 * host-only and returns a cudaError_t.
 */
extern __host__ cudaError_t  cudaStreamCreate(cudaStream_t *pStream);

extern __host__  cudaError_t  cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);

extern __host__  cudaError_t  cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags, int priority);

extern __host__  cudaError_t  cudaStreamGetPriority(cudaStream_t hStream, int *priority);

extern __host__  cudaError_t  cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags);

extern __host__  cudaError_t  cudaStreamDestroy(cudaStream_t stream);

extern __host__  cudaError_t  cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);

/* Pointer type for host callbacks registered with cudaStreamAddCallback.  The
 * callback uses the CUDART_CB calling convention and receives the stream, a
 * cudaError_t status and an opaque userData pointer.
 * NOTE(review): presumably userData is the pointer handed to
 * cudaStreamAddCallback -- confirm. */
typedef void (CUDART_CB *cudaStreamCallback_t)(cudaStream_t stream, cudaError_t status, void *userData);

extern __host__ cudaError_t  cudaStreamAddCallback(cudaStream_t stream,
cudaStreamCallback_t callback, void *userData, unsigned int flags);

extern __host__ cudaError_t  cudaStreamSynchronize(cudaStream_t stream);

extern __host__ cudaError_t  cudaStreamQuery(cudaStream_t stream);

/* C++ callers may omit the last two arguments: length defaults to 0 and flags
 * to cudaMemAttachSingle (via __dv).  C callers must pass both. */
extern __host__  cudaError_t  cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length __dv(0), unsigned int flags __dv(cudaMemAttachSingle));
/*
extern __host__ cudaError_t  cudaStreamBeginCapture(cudaStream_t stream);

extern __host__ cudaError_t  cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t *pGraph);

extern __host__ cudaError_t  cudaStreamIsCapturing(cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus);
*/
/*
 * Event management.
 *
 * Creation functions return the new event through the pointer argument; the
 * remaining functions take a cudaEvent_t by value.
 */
extern __host__ cudaError_t  cudaEventCreate(cudaEvent_t *event);

extern __host__  cudaError_t  cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);

/* In C++ the stream argument defaults to 0 (via __dv); C callers pass it
 * explicitly. */
extern __host__  cudaError_t  cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0));

extern __host__ cudaError_t  cudaEventQuery(cudaEvent_t event);

extern __host__ cudaError_t  cudaEventSynchronize(cudaEvent_t event);

extern __host__  cudaError_t  cudaEventDestroy(cudaEvent_t event);

/* Writes a float through ms for the pair of events (start, end).
 * NOTE(review): the parameter name suggests milliseconds -- confirm. */
extern __host__ cudaError_t  cudaEventElapsedTime(float *ms, cudaEvent_t start, cudaEvent_t end);

/*
 * Kernel launch, per-function configuration and occupancy queries.
 *
 * Kernels are referred to by an opaque "const void *func" handle.  Prototypes
 * that are commented out in this group are not offered by this wrapper.
 */

/* Explicit launch: grid/block dimensions, an array of argument pointers, a
 * shared-memory size in bytes (size_t) and the target stream.  None of the
 * parameters has a default value. */
extern __host__ cudaError_t  cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);

/* Same parameter list as cudaLaunchKernel. */
extern __host__ cudaError_t  cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);

/* Takes an array of numDevices launch descriptors; flags defaults to 0 in C++. */
extern __host__ cudaError_t  cudaLaunchCooperativeKernelMultiDevice(cudaLaunchParams *launchParamsList, unsigned int numDevices, unsigned int flags  __dv(0));

extern __host__ cudaError_t  cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig);

//extern __host__ cudaError_t  cudaFuncSetSharedMemConfig(const void *func, enum cudaSharedMemConfig config);

extern __host__  cudaError_t  cudaFuncGetAttributes(cudaFuncAttributes *attr, const void *func);

//extern __host__  cudaError_t  cudaFuncSetAttribute(const void *func, enum cudaFuncAttribute attr, int value);

//extern __CUDA_DEPRECATED __host__ cudaError_t  cudaSetDoubleForDevice(double *d);

//extern __CUDA_DEPRECATED  __host__ cudaError_t  cudaSetDoubleForHost(double *d);

//extern __host__ cudaError_t  cudaLaunchHostFunc(cudaStream_t stream, cudaHostFn_t fn, void *userData);

/* Occupancy queries: the result is written to numBlocks for the given block
 * size and dynamic shared-memory size. */
extern __host__  cudaError_t  cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize);

extern __host__  cudaError_t  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize, unsigned int flags);

/* Legacy launch-configuration entry points.  sharedMem and stream default to 0
 * in C++.
 * NOTE(review): the matching cudaLaunch() prototype below is commented out, so
 * the legacy configure/setup/launch sequence cannot be completed through this
 * header alone -- confirm this is intended. */
extern __host__ cudaError_t  cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem __dv(0), cudaStream_t stream __dv(0));

extern __host__ cudaError_t  cudaSetupArgument(const void *arg, size_t size, size_t offset);

//extern __host__ cudaError_t  cudaLaunch(const void *func);

/*
 * Memory allocation, release and host-memory registration.
 *
 * Allocation functions return the new pointer/array through their first
 * argument and a cudaError_t as the function result; sizes are size_t.
 * NOTE(review): which memory space each allocator serves (device, pinned host,
 * managed) is presumed to match the CUDA function of the same name -- confirm.
 */

/* flags defaults to cudaMemAttachGlobal in C++ (via __dv). */
extern __host__  cudaError_t  cudaMallocManaged(void **devPtr, size_t size, unsigned int flags __dv(cudaMemAttachGlobal));

extern __host__  cudaError_t  cudaMalloc(void **devPtr, size_t size);

extern __host__ cudaError_t  cudaMallocHost(void **ptr, size_t size);

/* Returns both the pointer and the row pitch chosen for the allocation; callers
 * need the returned pitch for the 2D copy/set functions, which take pitches. */
extern __host__ cudaError_t  cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height);

/* height and flags default to 0 in C++. */
extern __host__ cudaError_t  cudaMallocArray(cudaArray_t *array, const cudaChannelFormatDesc *desc, size_t width, size_t height __dv(0), unsigned int flags __dv(0));

/* Release functions, one per kind of handle (raw pointer, host pointer, array,
 * mipmapped array). */
extern __host__  cudaError_t  cudaFree(void *devPtr);

extern __host__ cudaError_t  cudaFreeHost(void *ptr);

extern __host__ cudaError_t  cudaFreeArray(cudaArray_t array);

extern __host__ cudaError_t  cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray);

/* Host allocation / registration with explicit flags; none of the flags
 * parameters has a default value. */
extern __host__ cudaError_t  cudaHostAlloc(void **pHost, size_t size, unsigned int flags);

extern __host__ cudaError_t  cudaHostRegister(void *ptr, size_t size, unsigned int flags);

extern __host__ cudaError_t  cudaHostUnregister(void *ptr);

extern __host__ cudaError_t  cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags);

extern __host__ cudaError_t  cudaHostGetFlags(unsigned int *pFlags, void *pHost);

/* 3D and mipmapped allocations described by a cudaExtent passed by value. */
extern __host__ cudaError_t  cudaMalloc3D(cudaPitchedPtr* pitchedDevPtr, cudaExtent extent);

extern __host__ cudaError_t  cudaMalloc3DArray(cudaArray_t *array, const cudaChannelFormatDesc* desc, cudaExtent extent, unsigned int flags __dv(0));

extern __host__ cudaError_t  cudaMallocMipmappedArray(cudaMipmappedArray_t *mipmappedArray, const cudaChannelFormatDesc* desc, cudaExtent extent, unsigned int numLevels, unsigned int flags __dv(0));

/* Returns the array for one level of a (const) mipmapped array. */
extern __host__ cudaError_t  cudaGetMipmappedArrayLevel(cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray, unsigned int level);

/*
 * Memory copies.
 *
 * Conventions visible in the signatures:
 *  - destination comes before source;
 *  - the direction is given by an enum cudaMemcpyKind argument (the Peer
 *    variants name the two devices instead);
 *  - "...Async" variants add a trailing cudaStream_t that defaults to 0 in C++
 *    (via __dv); C callers must pass it.
 * Prototypes that are commented out are not offered by this wrapper.
 */
extern __host__ cudaError_t  cudaMemcpy3D(const cudaMemcpy3DParms *p);

extern __host__  cudaError_t  cudaMemcpy3DAsync(const cudaMemcpy3DParms *p, cudaStream_t stream __dv(0));

//extern __host__ cudaError_t  cudaMemGetInfo(size_t *free, size_t *total);

/* Queries an existing array: descriptor, extent and flags are all outputs. */
extern __host__ cudaError_t  cudaArrayGetInfo(cudaChannelFormatDesc *desc, cudaExtent *extent, unsigned int *flags, cudaArray_t array);

/* Linear copy of count bytes (size_t count). */
extern __host__ cudaError_t  cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind);

extern __host__ cudaError_t  cudaMemcpyPeer(void *dst, int dstDevice, const void *src, int srcDevice, size_t count);

/* Array copies address the array side with a (wOffset, hOffset) pair. */
extern __host__ cudaError_t  cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind);

extern __host__ cudaError_t  cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind);

//extern __host__ cudaError_t  cudaMemcpyArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice));

/* 2D copies take a pitch for each linear-memory side (dpitch / spitch) in
 * addition to width and height. */
extern __host__ cudaError_t  cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);

extern __host__ cudaError_t  cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);

extern __host__ cudaError_t  cudaMemcpy2DFromArray(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind);

//extern __host__ cudaError_t  cudaMemcpy2DArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice));

/* Symbol copies: in C++ offset defaults to 0 and kind to
 * cudaMemcpyHostToDevice (ToSymbol) / cudaMemcpyDeviceToHost (FromSymbol). */
extern __host__ cudaError_t  cudaMemcpyToSymbol(const void *symbol, const void *src, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice));

extern __host__ cudaError_t  cudaMemcpyFromSymbol(void *dst, const void *symbol, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost));

extern __host__  cudaError_t  cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));

extern __host__ cudaError_t  cudaMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice, size_t count, cudaStream_t stream __dv(0));

//extern __host__ cudaError_t  cudaMemcpyToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));

//extern __host__ cudaError_t  cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));

extern __host__  cudaError_t  cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));

//extern __host__ cudaError_t  cudaMemcpy2DToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));

//extern __host__ cudaError_t  cudaMemcpy2DFromArrayAsync(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));

/* Unlike the synchronous symbol copies above, offset and kind have no default
 * here; only the trailing stream does. */
extern __host__ cudaError_t  cudaMemcpyToSymbolAsync(const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));

extern __host__ cudaError_t  cudaMemcpyFromSymbolAsync(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));

/*
 * Memory fill.
 *
 * The fill value is passed as an int in every variant.  The "...Async"
 * variants add a trailing stream that defaults to 0 in C++ (via __dv).
 * NOTE(review): in CUDA the value is applied byte-wise, so only byte patterns
 * (typically 0) can be written this way; presumably the same here -- confirm.
 */
extern __host__ cudaError_t  cudaMemset(void *devPtr, int value, size_t count);

extern __host__ cudaError_t  cudaMemset2D(void *devPtr, size_t pitch, int value, size_t width, size_t height);

extern __host__ cudaError_t  cudaMemset3D(cudaPitchedPtr pitchedDevPtr, int value, cudaExtent extent);

extern __host__  cudaError_t  cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream __dv(0));

extern __host__  cudaError_t  cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream __dv(0));

extern __host__  cudaError_t  cudaMemset3DAsync(cudaPitchedPtr pitchedDevPtr, int value, cudaExtent extent, cudaStream_t stream __dv(0));

/*
 * Symbol lookup, memory-range hints/attributes and pointer attributes.
 */

/* Symbol queries: the symbol is an opaque const void * and the result (address
 * or size) is written through the first argument. */
extern __host__ cudaError_t  cudaGetSymbolAddress(void **devPtr, const void *symbol);

extern __host__ cudaError_t  cudaGetSymbolSize(size_t *size, const void *symbol);

/* Range operations take (devPtr, count) plus a target device ordinal; the
 * prefetch variant also takes a stream that defaults to 0 in C++. */
extern __host__ cudaError_t  cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice, cudaStream_t stream __dv(0));

extern __host__ cudaError_t  cudaMemAdvise(const void *devPtr, size_t count, enum cudaMemoryAdvise advice, int device);

/* Single-attribute query: the caller supplies the output buffer and its size. */
extern __host__ cudaError_t  cudaMemRangeGetAttribute(void *data, size_t dataSize, enum cudaMemRangeAttribute attribute, const void *devPtr, size_t count);

/* Multi-attribute query: data, dataSizes and attributes are parallel arrays
 * with numAttributes entries each (as implied by the parameter list --
 * TODO confirm). */
extern __host__ cudaError_t  cudaMemRangeGetAttributes(void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes, size_t numAttributes, const void *devPtr, size_t count);

extern __host__ cudaError_t  cudaPointerGetAttributes(struct cudaPointerAttributes *attributes, const void *ptr);

/*
 * Peer-to-peer access between devices, identified by int ordinals.
 * The capability query writes its answer to canAccessPeer; enabling takes an
 * explicit flags argument with no default value.
 */
extern __host__ cudaError_t  cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice);

extern __host__ cudaError_t  cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags);

extern __host__ cudaError_t  cudaDeviceDisablePeerAccess(int peerDevice);

/*
 * Channel descriptors and texture-reference binding.
 */

/* Reads the channel descriptor of an existing (const) array into desc. */
extern __host__ cudaError_t  cudaGetChannelDesc(cudaChannelFormatDesc *desc, cudaArray_const_t array);

/* Builds a descriptor from four ints (x, y, z, w) and a format kind.  Unlike
 * the other prototypes in this header it returns the descriptor by value
 * instead of a cudaError_t. */
extern __host__ cudaChannelFormatDesc  cudaCreateChannelDesc(int x, int y, int z, int w, enum cudaChannelFormatKind f);

/* Binds linear memory to a texture reference.  In C++ size defaults to
 * UINT_MAX, a macro provided by <limits.h>. */
extern __host__ cudaError_t  cudaBindTexture(size_t *offset, const struct textureReference *texref, const void *devPtr, const cudaChannelFormatDesc *desc, size_t size __dv(UINT_MAX));

/* 2D variant: width, height and pitch are all mandatory. */
extern __host__ cudaError_t  cudaBindTexture2D(size_t *offset, const struct textureReference *texref, const void *devPtr, const cudaChannelFormatDesc *desc, size_t width, size_t height, size_t pitch);

extern __host__ cudaError_t  cudaBindTextureToArray(const struct textureReference *texref, cudaArray_const_t array, const cudaChannelFormatDesc *desc);

extern __host__ cudaError_t  cudaBindTextureToMipmappedArray(const struct textureReference *texref, cudaMipmappedArray_const_t mipmappedArray, const cudaChannelFormatDesc *desc);

//TODO: will redefine textureReference with hip
extern __host__ cudaError_t  cudaUnbindTexture(const struct textureReference *texref);

extern __host__ cudaError_t  cudaGetTextureAlignmentOffset(size_t *offset, const struct textureReference *texref);

/* Looks up the texture reference associated with an opaque symbol and returns
 * it through texref as a pointer-to-const. */
extern __host__ cudaError_t  cudaGetTextureReference(const struct textureReference **texref, const void *symbol);

//extern __host__ cudaError_t  cudaBindSurfaceToArray(const struct surfaceReference *surfref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc);

//extern __host__ cudaError_t  cudaGetSurfaceReference(const struct surfaceReference **surfref, const void *symbol);

/*
 * Texture objects.
 *
 * Creation takes three read-only descriptors (resource, texture, resource view)
 * and returns the object through pTexObject; the three Get functions read the
 * corresponding descriptor back from an existing object.
 * NOTE(review): whether pResViewDesc may be NULL is not visible from this
 * header -- confirm against the implementation.
 */
extern __host__ cudaError_t  cudaCreateTextureObject(cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, const struct cudaTextureDesc *pTexDesc, const struct cudaResourceViewDesc *pResViewDesc);

extern __host__ cudaError_t  cudaDestroyTextureObject(cudaTextureObject_t texObject);

extern __host__ cudaError_t  cudaGetTextureObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject);

extern __host__ cudaError_t  cudaGetTextureObjectTextureDesc(struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject);

extern __host__ cudaError_t  cudaGetTextureObjectResourceViewDesc(struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject);

/*
 * Version queries: each writes one int through its pointer argument.
 * NOTE(review): whether cudaRuntimeGetVersion reports the CUDART_VERSION value
 * defined at the top of this header is not visible here -- confirm.
 */
extern __host__ cudaError_t  cudaDriverGetVersion(int *driverVersion);

extern __host__  cudaError_t  cudaRuntimeGetVersion(int *runtimeVersion);

#if defined(__cplusplus)
}

#endif /* __cplusplus */

/* Remove the header-private helper macros so they do not leak into including
 * code.
 * NOTE(review): __dv is removed unconditionally, i.e. also when it had already
 * been defined before this header was included (the guard earlier in the file
 * only skips the definition) -- confirm that no includer relies on it. */
#undef __dv
#undef __CUDA_DEPRECATED

#endif /* !__CUDA_RUNTIME_API_H__ */
