cuda_rocm_interop.h 2.18 KB
Newer Older
1
2
3
/*!
 * Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
 */
4
5
#pragma once

6
#if defined(USE_CUDA) || defined(USE_ROCM)
7

8
9
#if defined(__HIP_PLATFORM_AMD__)

10
11
12
13
// ROCm doesn't have __shfl_down_sync, only __shfl_down without mask.
// Since mask is full 0xffffffff, we can use __shfl_down instead.
#define __shfl_down_sync(mask, val, offset) __shfl_down(val, offset)
#define __shfl_up_sync(mask, val, offset) __shfl_up(val, offset)
14

15
16
// ROCm doesn't have atomicAdd_block, but it should be semantically the same as atomicAdd
#define atomicAdd_block atomicAdd
17

18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
// hipify
#include <hip/hip_runtime.h>
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaFree hipFree
#define cudaFreeHost hipFreeHost
#define cudaGetDevice hipGetDevice
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorName hipGetErrorName
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaHostAlloc hipHostAlloc
#define cudaHostAllocPortable hipHostAllocPortable
#define cudaMalloc hipMalloc
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemoryTypeHost hipMemoryTypeHost
#define cudaMemset hipMemset
#define cudaPointerAttributes hipPointerAttribute_t
#define cudaPointerGetAttributes hipPointerGetAttributes
#define cudaSetDevice hipSetDevice
#define cudaStreamCreate hipStreamCreate
#define cudaStreamDestroy hipStreamDestroy
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
47
48
49
50
51
52
53
54
55
56
57
58
59
60

// warpSize is only allowed for device code.
// HIP header used to define warpSize as a constexpr that was either 32 or 64
// depending on the target device, and then always set it to 64 for host code.
static inline constexpr int WARP_SIZE_INTERNAL() {
#if defined(__GFX9__)
  return 64;
#else // __GFX9__
  return 32;
#endif // __GFX9__
}
#define WARPSIZE (WARP_SIZE_INTERNAL())

#else // __HIP_PLATFORM_AMD__
61
62
63
64
// CUDA warpSize is not a constexpr, but always 32
#define WARPSIZE 32
#endif

65
#endif // USE_CUDA || USE_ROCM