#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/cuda/detail/TensorInfo.cuh>

#include <THC/THCGeneral.h>
#include <THC/THCThrustAllocator.cuh>

#include <thrust/execution_policy.h>

#include "atomics.cuh"
#include "compat.cuh"
#include "index.cuh"

#define THREADS 256
#define BLOCKS(TB, N) (((TB) * (N) + THREADS - 1) / THREADS) // ceil(TB * N / THREADS)
#define FULL_MASK 0xffffffff

// template <typename scalar_t, int TB>
// __global__ void segment_add_csr_broadcast_kernel(const scalar_t *src_data,
//                                                  const int64_t *indptr_data,
//                                                  scalar_t *out_data,
//                                                  size_t numel) {}

// template <typename T, typename I> struct IndexPtrToOffset<T, I> {
// static inline __host__ __device__ I
// get(I idx, const at::cuda::detail::TensorInfo<T, I> &info) {

//   return idx;
// I offset = idx % (info.sizes[info.dims - 1] - 1);
// idx /= info.sizes[info.dims - 1] - 1;
// for (int i = info.dims - 2; i >= 0; --i) {
//   offset += (idx % info.sizes[i]) * info.strides[i];
//   idx /= info.sizes[i];
// }
// return offset;
// }
// };

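// Maps a flat output-row index to an element offset inside `indptr`: the last
// dimension holds `size` row boundaries (i.e. `size - 1` rows, assumed
// contiguous), the leading dimensions are resolved via their strides.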
template <typename T, typename I> struct IndexPtrToOffset {
  static __host__ __device__ I
  get(I idx, const at::cuda::detail::TensorInfo<T, I> &info) {
    I offset = idx % (info.sizes[info.dims - 1] - 1);
    idx /= info.sizes[info.dims - 1] - 1;
    for (int i = info.dims - 2; i >= 0; --i) {
      offset += (idx % info.sizes[i]) * info.strides[i];
      idx /= info.sizes[i];
    }
    return offset;
  }
};

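// Segment sum over CSR boundaries: each group of TB threads handles one output
// row `[row_start, row_end)`, strides over the corresponding slice of `src`,
// and combines the per-thread partial sums with a warp shuffle reduction;
// lane 0 writes the result.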
template <typename scalar_t, int TB>
__global__ void segment_add_csr_kernel(
    const scalar_t *src_data,
    const at::cuda::detail::TensorInfo<int64_t, int> indptr_info,
    scalar_t *out_data, size_t N, size_t E) {

  int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int warp_idx = thread_idx / TB;
  int lane_idx = thread_idx & (TB - 1);

  if (warp_idx < N) {
    auto offset = IndexPtrToOffset<int64_t, int>::get(warp_idx, indptr_info);
    int row_start = __ldg(indptr_info.data + offset);
    int row_end = __ldg(indptr_info.data + offset + 1);
    scalar_t val = (scalar_t)0;

    offset = (warp_idx / (indptr_info.sizes[indptr_info.dims - 1] - 1)) * E;
    for (int src_idx = row_start + lane_idx; src_idx < row_end; src_idx += TB) {
      val += src_data[offset + src_idx];
    }

#pragma unroll
    for (int i = TB / 2; i > 0; i /= 2)
      val += __shfl_down_sync(FULL_MASK, val, i); // Parallel reduction.

    if (lane_idx == 0) {
      out_data[warp_idx] = val;
    }
  }
}

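// Sums `src` along the last dimension of `indptr`; the output replaces that
// dimension by `indptr.size(-1) - 1`. The number of threads cooperating per
// row (TB) is picked from the average segment length.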
at::Tensor segment_add_csr_cuda(at::Tensor src, at::Tensor indptr) {
  src = src.contiguous();
  AT_ASSERTM(indptr.stride(-1) == 1,
             "indptr must be contiguous in its last dimension");
  AT_ASSERTM(src.dim() >= indptr.dim(),
             "src must have at least as many dimensions as indptr");

  auto reduce_dim = indptr.dim() - 1;
  auto sizes = src.sizes().vec();
  sizes[reduce_dim] = indptr.size(reduce_dim) - 1;
  auto out = at::empty(sizes, src.options());

  auto N = (indptr.size(-1) - 1) * (indptr.numel() / indptr.size(-1));
  auto E = src.size(reduce_dim);
  auto avg_length = (float)src.size(reduce_dim) / (float)out.size(reduce_dim);

  auto indptr_info = at::cuda::detail::getTensorInfo<int64_t, int>(indptr);
  auto stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_ALL_TYPES(src.scalar_type(), "segment_add_csr_kernel", [&] {
    auto src_data = src.DATA_PTR<scalar_t>();
    auto out_data = out.DATA_PTR<scalar_t>();

    if (avg_length <= 4) {
      segment_add_csr_kernel<scalar_t, 4><<<BLOCKS(4, N), THREADS, 0, stream>>>(
          src_data, indptr_info, out_data, N, E);
    } else if (avg_length <= 8) {
      segment_add_csr_kernel<scalar_t, 8><<<BLOCKS(8, N), THREADS, 0, stream>>>(
          src_data, indptr_info, out_data, N, E);
    } else if (avg_length <= 16) {
      segment_add_csr_kernel<scalar_t, 16>
          <<<BLOCKS(16, N), THREADS, 0, stream>>>(src_data, indptr_info,
                                                  out_data, N, E);
    } else {
      segment_add_csr_kernel<scalar_t, 32>
          <<<BLOCKS(32, N), THREADS, 0, stream>>>(src_data, indptr_info,
                                                  out_data, N, E);
    }
  });

  return out;
}

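// Segment sum over a COO index vector: a warp-level inclusive scan folds runs
// of consecutive equal indices together; only the last thread of each run then
// adds its partial sum into `out` (atomAdd presumably comes from atomics.cuh).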
template <typename scalar_t, int TB>
__global__ void segment_add_coo_kernel(const scalar_t *src_data,
                                       const int64_t *index_data,
                                       scalar_t *out_data, size_t numel) {

  int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int lane_idx = thread_idx & (TB - 1);

  if (thread_idx < numel) {
    auto idx = __ldg(index_data + thread_idx);
    scalar_t val = src_data[thread_idx], tmp;

#pragma unroll
    for (int offset = 1; offset < TB; offset *= 2) {
      tmp = __shfl_up_sync(FULL_MASK, val, offset);
      if (lane_idx >= offset &&
          idx == __ldg(index_data + thread_idx - offset)) {
        val += tmp;
      }
    }

    // The last element of each run of equal indices writes the accumulated
    // sum; guard the lookahead so the final thread does not read past the end.
    if (lane_idx == TB - 1 || thread_idx == numel - 1 ||
        idx != __ldg(index_data + thread_idx + 1)) {
      atomAdd(out_data + idx, val);
    }
  }
}

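// Scatter-adds `src` into `out` at the positions given by `index`; results are
// accumulated into whatever `out` already holds (typically zeros).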
void segment_add_coo_cuda(at::Tensor src, at::Tensor index, at::Tensor out) {
  auto numel = src.numel();
  auto avg_length = (float)numel / (float)out.numel();

  auto index_data = index.DATA_PTR<int64_t>();
  auto stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_ALL_TYPES(src.scalar_type(), "segment_add_coo_kernel", [&] {
    auto src_data = src.DATA_PTR<scalar_t>();
    auto out_data = out.DATA_PTR<scalar_t>();

    segment_add_coo_kernel<scalar_t, 32>
        <<<BLOCKS(1, numel), THREADS, 0, stream>>>(src_data, index_data,
                                                   out_data, numel);
  });
}

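// Thrust-based variant: thrust::reduce_by_key sums runs of consecutive equal
// indices from `index` into `out`; `key` merely receives the unique keys and
// is discarded.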
void segment_add_thrust_cuda(at::Tensor src, at::Tensor index, at::Tensor out) {
  auto stream = at::cuda::getCurrentCUDAStream();
  auto allocator = THCThrustAllocator(at::globalContext().lazyInitCUDA());
  auto policy = thrust::cuda::par(allocator).on(stream);

  auto key = at::full_like(out, -1, out.options().dtype(at::kLong));

  auto index_data = thrust::device_ptr<int64_t>(index.DATA_PTR<int64_t>());
  auto key_data = thrust::device_ptr<int64_t>(key.DATA_PTR<int64_t>());

  AT_DISPATCH_ALL_TYPES(src.scalar_type(), "segment_add_thrust_kernel", [&] {
    auto src_data = thrust::device_ptr<scalar_t>(src.DATA_PTR<scalar_t>());
    auto out_data = thrust::device_ptr<scalar_t>(out.DATA_PTR<scalar_t>());

    thrust::reduce_by_key(policy, index_data, index_data + index.numel(),
                          src_data, key_data, out_data);
  });
}