/*!
 *  Copyright (c) 2019 by Contributors
 * \file kernel/cuda/functor.cuh
 * \brief Functors for templated kernels on CUDA
 */
#ifndef DGL_KERNEL_CUDA_FUNCTOR_CUH_
#define DGL_KERNEL_CUDA_FUNCTOR_CUH_

#include "../binary_reduce_common.h"
#include "./atomic.cuh"

namespace dgl {
namespace kernel {
namespace cuda {

// Cached load from global memory. __ldg reads through the read-only
// data cache, which requires compute capability 3.5 or higher; older
// architectures fall back to a plain dereference.
template <typename DType>
struct LDGReader {
  static __device__ __forceinline__ DType Call(DType* addr) {
#if __CUDA_ARCH__ >= 350
    return __ldg(addr);
#else
    return *addr;
#endif
  }
};
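
// Illustrative sketch (not part of the original file): a copy kernel that
// routes its reads through LDGReader so read-only traffic can take the
// read-only cache path. The kernel name and argument layout are assumptions
// made for this example.
template <typename DType>
__global__ void LDGCopyKernel(DType* dst, DType* src, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    dst[idx] = LDGReader<DType>::Call(src + idx);
  }
}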

}  // namespace cuda

// Reducer functor specializations for the GPU. Call combines val into the
// accumulator at addr (atomically for the true reductions); BackwardCall
// returns the partial derivative of the reduced result with respect to the
// input val, given the final reduced value accum.
template <typename DType>
struct ReduceSum<kDLGPU, DType> {
  static __device__ __forceinline__ void Call(DType* addr, DType val) {
    cuda::AtomicAdd(addr, val);
  }
  static __device__ __forceinline__ DType BackwardCall(DType val, DType accum) {
    return 1;  // d(sum)/d(val) = 1 for every contributing input
  }
};

template <typename DType>
struct ReduceMax<kDLGPU, DType> {
  static __device__ __forceinline__ void Call(DType* addr, DType val) {
    cuda::AtomicMax(addr, val);
  }
  static __device__ __forceinline__ DType BackwardCall(DType val, DType accum) {
    // Gradient flows only to the inputs that attained the maximum.
    return static_cast<DType>(val == accum);
  }
};

template <typename DType>
struct ReduceMin<kDLGPU, DType> {
  static __device__ __forceinline__ void Call(DType* addr, DType val) {
    cuda::AtomicMin(addr, val);
  }
  static __device__ __forceinline__ DType BackwardCall(DType val, DType accum) {
    // Gradient flows only to the inputs that attained the minimum.
    return static_cast<DType>(val == accum);
  }
};

template <typename DType>
struct ReduceProd<kDLGPU, DType> {
  static __device__ __forceinline__ void Call(DType* addr, DType val) {
    cuda::AtomicMul(addr, val);
  }
  static __device__ __forceinline__ DType BackwardCall(DType val, DType accum) {
    // d(prod)/d(val) = prod / val; assumes val is nonzero.
    return accum / val;
  }
};

template <typename DType>
struct ReduceNone<kDLGPU, DType> {
  static __device__ __forceinline__ void Call(DType* addr, DType val) {
    *addr = val;
  }
  static __device__ __forceinline__ DType BackwardCall(DType val, DType accum) {
    return 1;  // identity write, so the gradient passes through unchanged
  }
};
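
// Illustrative sketch (not part of the original file): how a scatter kernel
// might combine per-edge values into per-node outputs through one of the
// reducers above, e.g. Reducer = ReduceSum<kDLGPU, float>. The kernel name
// and argument layout are assumptions made for this example.
template <typename Reducer, typename DType>
__global__ void ScatterReduceKernel(
    const int* dst_ids, DType* edge_vals, DType* out, int num_edges) {
  int eid = blockIdx.x * blockDim.x + threadIdx.x;
  if (eid < num_edges) {
    DType val = cuda::LDGReader<DType>::Call(edge_vals + eid);
    // Atomically fold this edge's value into its destination node.
    Reducer::Call(out + dst_ids[eid], val);
  }
}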

}  // namespace kernel
}  // namespace dgl

#endif  // DGL_KERNEL_CUDA_FUNCTOR_CUH_