#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/cuda/detail/TensorInfo.cuh>

#include <THC/THCGeneral.h>
#include <THC/THCThrustAllocator.cuh>

#include <thrust/execution_policy.h>

#include "atomics.cuh"
#include "compat.cuh"
#include "index.cuh"

#define THREADS 256
#define BLOCKS(TB, N) (((TB) * (N) + THREADS - 1) / THREADS)
#define FULL_MASK 0xffffffff

// template <typename scalar_t, int TB>
// __global__ void segment_add_csr_broadcast_kernel(const scalar_t *src_data,
//                                                  const int64_t *indptr_data,
//                                                  scalar_t *out_data,
//                                                  size_t numel) {}

// template <typename T, typename I> struct IndexPtrToOffset<T, I> {
//   static inline __host__ __device__ I
//   get(I idx, const at::cuda::detail::TensorInfo<T, I> &info) {
//     return idx;
//     I offset = idx % (info.sizes[info.dims - 1] - 1);
//     idx /= info.sizes[info.dims - 1] - 1;
//     for (int i = info.dims - 2; i >= 0; --i) {
//       offset += (idx % info.sizes[i]) * info.strides[i];
//       idx /= info.sizes[i];
//     }
//     return offset;
//   }
// };

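// Computes the memory offset of the `indptr` entry that starts segment `idx`.
// The last dimension of `indptr` holds `num_segments + 1` boundaries per row,
// hence the `- 1` when decomposing the flat index.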
template <typename T, typename I> struct IndexPtrToOffset {
  static __host__ __device__ I
  get(I idx, const at::cuda::detail::TensorInfo<T, I> &info) {
    I offset = idx % (info.sizes[info.dims - 1] - 1);
    offset *= info.strides[info.dims - 1];
    idx /= info.sizes[info.dims - 1] - 1;
    for (int i = info.dims - 2; i >= 0; --i) {
      offset += (idx % info.sizes[i]) * info.strides[i];
      idx /= info.sizes[i];
    }
    return offset;
  }
};

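// Sums `src` over the segments delimited by `indptr` (CSR layout). Each group
// of TB consecutive threads reduces one segment: lanes stride over the
// segment's elements, then combine their partials with warp shuffles. TB is
// picked from the average segment length by the host wrapper below.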
template <typename scalar_t, int TB>
__global__ void segment_add_csr_kernel(
    const scalar_t *src_data,
    const at::cuda::detail::TensorInfo<int64_t, int> indptr_info,
    scalar_t *out_data, size_t N, size_t E) {

  int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int warp_idx = thread_idx / TB;
  int lane_idx = thread_idx & (TB - 1);

  if (warp_idx < N) {
    auto offset = IndexPtrToOffset<int64_t, int>::get(warp_idx, indptr_info);
    int row_start = __ldg(indptr_info.data + offset);
    int row_end = __ldg(indptr_info.data + offset +
                        indptr_info.strides[indptr_info.dims - 1]);
    scalar_t val = (scalar_t)0;

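    // Jump to the start of the `src` row this segment belongs to: `indptr`
    // values index into a row, not into the flattened tensor.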
    offset = (warp_idx / (indptr_info.sizes[indptr_info.dims - 1] - 1)) * E;
    for (int src_idx = row_start + lane_idx; src_idx < row_end; src_idx += TB) {
      val += src_data[offset + src_idx];
    }

#pragma unroll
    for (int i = TB / 2; i > 0; i /= 2)
      val += __shfl_down_sync(FULL_MASK, val, i); // Parallel reduction.

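    // After the shuffle reduction, the first lane of the group holds the
    // segment's total.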
    if (lane_idx == 0) {
      out_data[warp_idx] = val;
    }
  }
}

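// Reduces `src` along dimension `indptr.dim() - 1`. Example: for `src` of
// shape [2, 5] and `indptr = [[0, 2, 5], [0, 4, 5]]` of shape [2, 3], `out`
// has shape [2, 2] with out[i][j] = sum(src[i][indptr[i][j]:indptr[i][j+1]]).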
at::Tensor segment_add_csr_cuda(at::Tensor src, at::Tensor indptr) {
  AT_ASSERTM(src.dim() >= indptr.dim(),
             "`src` must have at least as many dimensions as `indptr`");
  src = src.contiguous();

  auto reduce_dim = indptr.dim() - 1;
  auto sizes = src.sizes().vec();
  sizes[reduce_dim] = indptr.size(reduce_dim) - 1;
  auto out = at::empty(sizes, src.options());

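  // Total number of segments, row length of `src` along the reduced
  // dimension, and average segment length (used to pick the group size TB).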
  auto N = (indptr.size(-1) - 1) * (indptr.numel() / indptr.size(-1));
  auto E = src.size(reduce_dim);
  auto avg_length = (float)src.size(reduce_dim) / (float)out.size(reduce_dim);

  auto indptr_info = at::cuda::detail::getTensorInfo<int64_t, int>(indptr);
  auto stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_ALL_TYPES(src.scalar_type(), "segment_add_csr_kernel", [&] {
    auto src_data = src.DATA_PTR<scalar_t>();
    auto out_data = out.DATA_PTR<scalar_t>();

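    // Dispatch on the average segment length: short segments get narrow
    // thread groups, anything above 16 elements uses a full warp.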
    if (avg_length <= 4)
      segment_add_csr_kernel<scalar_t, 4><<<BLOCKS(4, N), THREADS, 0, stream>>>(
          src_data, indptr_info, out_data, N, E);
    else if (avg_length <= 8)
      segment_add_csr_kernel<scalar_t, 8><<<BLOCKS(8, N), THREADS, 0, stream>>>(
          src_data, indptr_info, out_data, N, E);
    else if (avg_length <= 16)
      segment_add_csr_kernel<scalar_t, 16>
          <<<BLOCKS(16, N), THREADS, 0, stream>>>(src_data, indptr_info,
                                                  out_data, N, E);
    else
      segment_add_csr_kernel<scalar_t, 32>
          <<<BLOCKS(32, N), THREADS, 0, stream>>>(src_data, indptr_info,
                                                  out_data, N, E);
  });

  return out;
}

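// Scatter-add for COO input: one thread per element of `src`. A segmented
// inclusive scan via __shfl_up_sync folds consecutive equal indices within a
// warp, so each run of duplicates costs one atomic instead of one per
// element. `out` is accumulated into and must be initialized by the caller.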
template <typename scalar_t, int TB>
__global__ void segment_add_coo_kernel(const scalar_t *src_data,
                                       const int64_t *index_data,
                                       scalar_t *out_data, size_t numel) {

  int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int lane_idx = thread_idx & (TB - 1);

  if (thread_idx < numel) {
    auto idx = __ldg(index_data + thread_idx);
    scalar_t val = src_data[thread_idx], tmp;

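    // Segmented inclusive scan across the warp: each step pulls the running
    // sum from the lane `offset` positions below and adds it if that lane
    // works on the same output index.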
#pragma unroll
    for (int offset = 1; offset < TB; offset *= 2) {
      tmp = __shfl_up_sync(FULL_MASK, val, offset);
      if (lane_idx >= offset &&
          idx == __ldg(index_data + thread_idx - offset)) {
        val += tmp;
      }
    }

    // Only the last thread of each run of equal indices writes out, so each
    // run costs a single atomic. Guard the look-ahead read on the final
    // element to avoid reading past the end of `index`.
    if (lane_idx == TB - 1 || thread_idx == numel - 1 ||
        idx != __ldg(index_data + thread_idx + 1)) {
      atomAdd(out_data + idx, val);
    }
  }
}

void segment_add_coo_cuda(at::Tensor src, at::Tensor index, at::Tensor out) {
  auto numel = src.numel();
  auto avg_length = (float)numel / (float)out.numel();

  auto index_data = index.DATA_PTR<int64_t>();
  auto stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_ALL_TYPES(src.scalar_type(), "segment_add_coo_kernel", [&] {
    auto src_data = src.DATA_PTR<scalar_t>();
    auto out_data = out.DATA_PTR<scalar_t>();

    segment_add_coo_kernel<scalar_t, 32>
        <<<BLOCKS(1, numel), THREADS, 0, stream>>>(src_data, index_data,
                                                   out_data, numel);
  });
}

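// Thrust fallback: reduce_by_key sums each consecutive run of equal `index`
// values, writing the i-th run's total to out[i] and its key to key[i]. This
// lines up with `out` as long as `index` is sorted and every output position
// occurs at least once.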
void segment_add_thrust_cuda(at::Tensor src, at::Tensor index, at::Tensor out) {
  auto stream = at::cuda::getCurrentCUDAStream();
  auto allocator = THCThrustAllocator(at::globalContext().lazyInitCUDA());
  auto policy = thrust::cuda::par(allocator).on(stream);

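  // Receives the index value of each reduced run; slots past the last run
  // keep their -1 fill value.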
  auto key = at::full_like(out, -1, out.options().dtype(at::kLong));

  auto index_data = thrust::device_ptr<int64_t>(index.DATA_PTR<int64_t>());
  auto key_data = thrust::device_ptr<int64_t>(key.DATA_PTR<int64_t>());

  AT_DISPATCH_ALL_TYPES(src.scalar_type(), "segment_add_thrust_kernel", [&] {
    auto src_data = thrust::device_ptr<scalar_t>(src.DATA_PTR<scalar_t>());
    auto out_data = thrust::device_ptr<scalar_t>(out.DATA_PTR<scalar_t>());

    thrust::reduce_by_key(policy, index_data, index_data + index.numel(),
                          src_data, key_data, out_data);
  });
}