cuda_rocm_interop.h 2.38 KB
Newer Older
1
2
3
/*!
 * Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
 */
4
5
6

#ifndef LIGHTGBM_INCLUDE_LIGHTGBM_CUDA_CUDA_ROCM_INTEROP_H_
#define LIGHTGBM_INCLUDE_LIGHTGBM_CUDA_CUDA_ROCM_INTEROP_H_
7

8
#ifdef USE_CUDA
9

10
11
#if defined(__HIP_PLATFORM_AMD__)

12
13
// ROCm doesn't provide atomicAdd_block, so alias it to atomicAdd, which should
// be semantically the same for the code that uses it.
// NOTE(review): in CUDA, atomicAdd_block is the block-scoped variant of
// atomicAdd; this substitution presumably only widens the scope -- confirm
// against the HIP documentation.
#define atomicAdd_block atomicAdd
14

15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
// hipify
// Source-level CUDA -> HIP aliasing: every CUDA runtime identifier listed
// below is redefined to its HIP counterpart, so code written against the CUDA
// spelling compiles unchanged when __HIP_PLATFORM_AMD__ is defined.
// The list is kept in alphabetical order of the CUDA name.
// Note that two type names are not a plain "cuda" -> "hip" prefix swap:
//   cudaDeviceProp        -> hipDeviceProp_t
//   cudaPointerAttributes -> hipPointerAttribute_t
#include <hip/hip_runtime.h>
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaFree hipFree
#define cudaFreeHost hipFreeHost
#define cudaGetDevice hipGetDevice
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorName hipGetErrorName
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaHostAlloc hipHostAlloc
#define cudaHostAllocPortable hipHostAllocPortable
#define cudaMalloc hipMalloc
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemoryTypeHost hipMemoryTypeHost
#define cudaMemset hipMemset
#define cudaPointerAttributes hipPointerAttribute_t
#define cudaPointerGetAttributes hipPointerGetAttributes
#define cudaSetDevice hipSetDevice
#define cudaStreamCreate hipStreamCreate
#define cudaStreamDestroy hipStreamDestroy
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
44

Jeff Daily's avatar
Jeff Daily committed
45
46
47
48
49
// ROCm 7.0 did add __shfl_down_sync et al, but the following hack still works.
// The shims below forward to the mask-less __shfl_down / __shfl_up and simply
// discard the `mask` argument. That is only equivalent when every call site
// passes a full mask (0xffffffff), which is what this code relies on.
// NOTE(review): a call site that passes a partial mask would silently lose it
// here -- verify callers before adding such a use.
#define __shfl_down_sync(mask, val, offset) __shfl_down(val, offset)
#define __shfl_up_sync(mask, val, offset) __shfl_up(val, offset)

50
51
52
53
54
55
// warpSize is only allowed for device code.
// HIP header used to define warpSize as a constexpr that was either 32 or 64
// depending on the target device, and then always set it to 64 for host code.
/*!
 * \brief Compile-time warp (wavefront) width for the current compilation pass.
 * \return 64 when the compiler defines __GFX9__, 32 otherwise.
 *
 * The value is chosen purely by the preprocessor, so it is usable in constant
 * expressions (array bounds, template arguments, static_assert).
 * NOTE(review): __GFX9__ is presumably defined only while compiling device
 * code for a gfx9 target, which would make host passes see 32 -- confirm that
 * host/device consumers of WARPSIZE are fine with that.
 */
static inline constexpr int WARP_SIZE_INTERNAL() {
#if defined(__GFX9__)
  return 64;
#else  // __GFX9__
  return 32;
#endif  // __GFX9__
}
// Parenthesized so WARPSIZE behaves like a single primary expression wherever
// it is expanded.
#define WARPSIZE (WARP_SIZE_INTERNAL())

Jeff Daily's avatar
lint  
Jeff Daily committed
62
#else  // __HIP_PLATFORM_AMD__
63
64
// Non-HIP (CUDA) path: CUDA's built-in warpSize is not a constexpr, but it is
// always 32, so expose it as a literal usable in constant expressions.
#define WARPSIZE 32
65
#endif  // defined(__HIP_PLATFORM_AMD__)
66

67
#endif  // USE_CUDA
Jeff Daily's avatar
Jeff Daily committed
68

69
#endif  // LIGHTGBM_INCLUDE_LIGHTGBM_CUDA_CUDA_ROCM_INTEROP_H_