cuda_rocm_interop.h 2.27 KB
Newer Older
1
2
3
/*!
 * Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
 */
Jeff Daily's avatar
Jeff Daily committed
4
5
#ifndef LIGHTGBM_CUDA_CUDA_ROCM_INTEROP_H_
#define LIGHTGBM_CUDA_CUDA_ROCM_INTEROP_H_
6

7
#ifdef USE_CUDA
8

9
10
#if defined(__HIP_PLATFORM_AMD__)

11
12
13
14
// ROCm doesn't have __shfl_down_sync / __shfl_up_sync, only __shfl_down /
// __shfl_up without a mask.
// The mask argument is discarded; this is only equivalent because the mask
// passed is the full 0xffffffff. Do not use these shims with a partial mask.
// Every argument after the mask is forwarded unchanged, so both the
// (mask, val, offset) form and the CUDA form with a trailing `width`
// argument are accepted.
#define __shfl_down_sync(mask, ...) __shfl_down(__VA_ARGS__)
#define __shfl_up_sync(mask, ...) __shfl_up(__VA_ARGS__)
15

16
17
// ROCm doesn't have atomicAdd_block, so every use of that name is textually
// rewritten to plain atomicAdd, which should be semantically the same.
// NOTE(review): assumes atomicAdd's guarantees are at least as strong as the
// block-scoped variant's, making this safe though possibly slower — confirm.
#define atomicAdd_block atomicAdd
18

19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
// hipify: include the HIP runtime and alias a set of CUDA runtime names
// (types, functions and enumerators) to their HIP spellings, so code written
// with the CUDA names resolves to HIP when __HIP_PLATFORM_AMD__ is defined.
// These are plain object-like macros: any identifier spelled exactly like one
// of the names below is rewritten, wherever it appears after this point.
#include <hip/hip_runtime.h>
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaFree hipFree
#define cudaFreeHost hipFreeHost
#define cudaGetDevice hipGetDevice
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorName hipGetErrorName
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaHostAlloc hipHostAlloc
#define cudaHostAllocPortable hipHostAllocPortable
#define cudaMalloc hipMalloc
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemoryTypeHost hipMemoryTypeHost
#define cudaMemset hipMemset
#define cudaPointerAttributes hipPointerAttribute_t
#define cudaPointerGetAttributes hipPointerGetAttributes
#define cudaSetDevice hipSetDevice
#define cudaStreamCreate hipStreamCreate
#define cudaStreamDestroy hipStreamDestroy
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
48
49
50
51
52
53
54

// Compile-time warp (wavefront) width for the ROCm build.
//
// warpSize is only allowed for device code.
// HIP header used to define warpSize as a constexpr that was either 32 or 64
// depending on the target device, and then always set it to 64 for host code.
// This helper supplies a constant expression that does not rely on warpSize.
//
// Returns 64 when __GFX9__ is defined and 32 otherwise.
// NOTE(review): __GFX9__ is presumably only defined while compiling device
// code for a GFX9 target; if so, host code sees 32 even when device code sees
// 64 — confirm that no host-side sizing depends on WARPSIZE matching the
// device value.
static inline constexpr int WARP_SIZE_INTERNAL() {
#if defined(__GFX9__)
  return 64;
#else  // __GFX9__
  return 32;
#endif  // __GFX9__
}
// Parenthesized so the call composes safely inside larger expressions.
#define WARPSIZE (WARP_SIZE_INTERNAL())

Jeff Daily's avatar
lint  
Jeff Daily committed
61
#else  // __HIP_PLATFORM_AMD__
62
63
64
65
// CUDA warpSize is not a constexpr, but is always 32, so a literal is used
// to keep WARPSIZE usable as a compile-time constant.
#define WARPSIZE 32
#endif  // __HIP_PLATFORM_AMD__

66
#endif  // USE_CUDA
Jeff Daily's avatar
Jeff Daily committed
67
68

#endif  // LIGHTGBM_CUDA_CUDA_ROCM_INTEROP_H_