#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/types.h>

#include <ATen/cuda/CUDAApplyUtils.cuh>

typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;

#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

namespace {
int const threadsPerBlock = 512;
int const maxGridDim = 50000;
}  // namespace

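// Atomic max for float, implemented with a compare-and-swap loop because CUDA
// provides no native floating-point atomicMax.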
__device__ __forceinline__ static void reduceMax(float *address, float val) {
  int *address_as_i = reinterpret_cast<int *>(address);
  int old = *address_as_i, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_i, assumed,
                    __float_as_int(fmaxf(val, __int_as_float(assumed))));
  } while (assumed != old || __int_as_float(old) < val);
}

__device__ __forceinline__ static void reduceMax(double *address, double val) {
  unsigned long long *address_as_ull =
      reinterpret_cast<unsigned long long *>(address);
  unsigned long long old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(
        address_as_ull, assumed,
        __double_as_longlong(fmax(val, __longlong_as_double(assumed))));
  } while (assumed != old || __longlong_as_double(old) < val);
}

// get rid of meaningless warnings when compiling host code
#ifdef __CUDA_ARCH__
__device__ __forceinline__ static void reduceAdd(float *address, float val) {
#if (__CUDA_ARCH__ < 200)
#warning \
    "compute capability lower than 2.x. fall back to use CAS version of atomicAdd for float32"
  int *address_as_i = reinterpret_cast<int *>(address);
  int old = *address_as_i, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_i, assumed,
                    __float_as_int(val + __int_as_float(assumed)));
  } while (assumed != old);
#else
  atomicAdd(address, val);
#endif
}

__device__ __forceinline__ static void reduceAdd(double *address, double val) {
#if (__CUDA_ARCH__ < 600)
#warning \
    "compute capability lower than 6.x. fall back to use CAS version of atomicAdd for float64"
  unsigned long long *address_as_ull =
      reinterpret_cast<unsigned long long *>(address);
  unsigned long long old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);
#else
  atomicAdd(address, val);
#endif
}
#endif

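// Scatter point features into their target voxels with a grid-stride loop.
// Points mapped to -1 in coors_map are skipped; MAX uses the CAS-based
// reduceMax above, while SUM/MEAN accumulate with reduceAdd (MEAN is
// finalized by a division on the host side).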
template <typename T>
__global__ void
feats_reduce_kernel(const T *feats, const int32_t *coors_map,
                    T *reduced_feats,  // pre-initialized: 0 for sum/mean, -inf for max
                    const int num_input, const int num_feats,
                    const reduce_t reduce_type) {
  for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_input;
       x += gridDim.x * blockDim.x) {
    int32_t reduce_to = coors_map[x];
    if (reduce_to == -1) continue;

    const T *feats_offset = feats + x * num_feats;
    T *reduced_feats_offset = reduced_feats + reduce_to * num_feats;
    if (reduce_type == reduce_t::MAX) {
      for (int i = 0; i < num_feats; i++) {
        reduceMax(&reduced_feats_offset[i], feats_offset[i]);
      }
    } else {
      for (int i = 0; i < num_feats; i++) {
        reduceAdd(&reduced_feats_offset[i], feats_offset[i]);
      }
    }
  }
}

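// Backward for SUM/MEAN: copy each voxel's gradient back to every contributing
// point, dividing by the per-voxel point count for MEAN.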
template <typename T>
__global__ void add_reduce_traceback_grad_kernel(
    T *grad_feats, const T *grad_reduced_feats, const int32_t *coors_map,
    const int32_t *reduce_count, const int num_input, const int num_feats,
    const reduce_t reduce_type) {
  for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_input;
       x += gridDim.x * blockDim.x) {
    int32_t reduce_to = coors_map[x];
    if (reduce_to == -1) {
      continue;
    }

    const int input_offset = x * num_feats;
    T *grad_feats_offset = grad_feats + input_offset;
    const int reduced_offset = reduce_to * num_feats;
    const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset;

    if (reduce_type == reduce_t::SUM) {
      for (int i = 0; i < num_feats; i++) {
        grad_feats_offset[i] = grad_reduced_feats_offset[i];
      }
    } else if (reduce_type == reduce_t::MEAN) {
      for (int i = 0; i < num_feats; i++) {
        grad_feats_offset[i] = grad_reduced_feats_offset[i] /
                               static_cast<T>(reduce_count[reduce_to]);
      }
    }
  }
}

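// Backward helper for MAX: for every feature channel, record which input point
// produced the reduced maximum. Ties are resolved to the smallest point index
// via atomicMin (reduce_from is pre-filled with num_input by the caller).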
template <typename T>
__global__ void max_reduce_traceback_scatter_idx_kernel(
    const T *feats, const T *reduced_feats, int32_t *reduce_from,
    const int32_t *coors_map, const int num_input, const int num_feats) {
  for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_input;
       x += gridDim.x * blockDim.x) {
    int32_t reduce_to = coors_map[x];

    const int input_offset = x * num_feats;
    const T *feats_offset = feats + input_offset;

    if (reduce_to == -1) {
      continue;
    }

    const int reduced_offset = reduce_to * num_feats;
    const T *reduced_feats_offset = reduced_feats + reduced_offset;
    int32_t *reduce_from_offset = reduce_from + reduced_offset;

    for (int i = 0; i < num_feats; i++) {
      if (feats_offset[i] == reduced_feats_offset[i]) {
        atomicMin(&reduce_from_offset[i], static_cast<int32_t>(x));
      }
    }
  }
}

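// Backward for MAX: route each voxel's gradient to the single point index
// recorded in reduce_from for that feature channel.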
template <typename T>
__global__ void max_reduce_scatter_grad_kernel(T *grad_feats,
                                               const T *grad_reduced_feats,
                                               const int32_t *reduce_from,
                                               const int num_reduced,
                                               const int num_feats) {
  for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_reduced;
       x += gridDim.x * blockDim.x) {
    const int reduced_offset = x * num_feats;
    const int32_t *scatter_to_offset = reduce_from + reduced_offset;
    const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset;

    for (int i = 0; i < num_feats; i++) {
      grad_feats[scatter_to_offset[i] * num_feats + i] =
          grad_reduced_feats_offset[i];
    }
  }
}

namespace voxelization {

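// Forward pass: deduplicate voxel coordinates with at::unique_dim, then reduce
// the features of all points that fall into the same voxel.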
std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
    const at::Tensor &feats, const at::Tensor &coors,
    const reduce_t reduce_type) {
  CHECK_INPUT(feats);
  CHECK_INPUT(coors);

  const int num_input = feats.size(0);
  const int num_feats = feats.size(1);

  if (num_input == 0)
    return {feats.clone().detach(),
            coors.clone().detach(),
            coors.new_empty({0}, torch::kInt32),
            coors.new_empty({0}, torch::kInt32)};

  at::Tensor out_coors;
  at::Tensor coors_map;
  at::Tensor reduce_count;

  auto coors_clean = coors.masked_fill(coors.lt(0).any(-1, true), -1);

  std::tie(out_coors, coors_map, reduce_count) =
      at::unique_dim(coors_clean, 0, true, true, true);

  if (out_coors.index({0, 0}).lt(0).item<bool>()) {
    // the first element of out_coors is (-1, -1, -1) and should be removed
    out_coors = out_coors.slice(0, 1);
    reduce_count = reduce_count.slice(0, 1);
    coors_map = coors_map - 1;
  }

  coors_map = coors_map.to(torch::kInt32);
  reduce_count = reduce_count.to(torch::kInt32);

  auto reduced_feats =
      at::empty({out_coors.size(0), num_feats}, feats.options());

  AT_DISPATCH_FLOATING_TYPES(
      feats.scalar_type(), "feats_reduce_kernel", ([&] {
        if (reduce_type == reduce_t::MAX)
          reduced_feats.fill_(-std::numeric_limits<scalar_t>::infinity());
        else
          reduced_feats.fill_(static_cast<scalar_t>(0));

        dim3 blocks(std::min(at::cuda::ATenCeilDiv(num_input, threadsPerBlock),
                             maxGridDim));
        dim3 threads(threadsPerBlock);
        feats_reduce_kernel<<<blocks, threads>>>(
            feats.data_ptr<scalar_t>(), coors_map.data_ptr<int32_t>(),
            reduced_feats.data_ptr<scalar_t>(), num_input, num_feats,
            reduce_type);
        if (reduce_type == reduce_t::MEAN)
          reduced_feats /= reduce_count.unsqueeze(-1).to(reduced_feats.dtype());
      }));
  AT_CUDA_CHECK(cudaGetLastError());

  return {reduced_feats, out_coors, coors_map, reduce_count};
}

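// Backward pass: dispatch to the traceback kernels above depending on the
// reduction type that was used in the forward pass.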
void dynamic_point_to_voxel_backward_gpu(at::Tensor &grad_feats,
                                         const at::Tensor &grad_reduced_feats,
                                         const at::Tensor &feats,
                                         const at::Tensor &reduced_feats,
                                         const at::Tensor &coors_map,
                                         const at::Tensor &reduce_count,
                                         const reduce_t reduce_type) {
  CHECK_INPUT(grad_feats);
  CHECK_INPUT(grad_reduced_feats);
  CHECK_INPUT(feats);
  CHECK_INPUT(reduced_feats);
  CHECK_INPUT(coors_map);
  CHECK_INPUT(reduce_count);

  const int num_input = feats.size(0);
  const int num_reduced = reduced_feats.size(0);
  const int num_feats = feats.size(1);

  grad_feats.fill_(0);
  // copy voxel grad to points

  if (num_input == 0 || num_reduced == 0) return;

  if (reduce_type == reduce_t::MEAN || reduce_type == reduce_t::SUM) {
    AT_DISPATCH_FLOATING_TYPES(
        grad_reduced_feats.scalar_type(), "add_reduce_traceback_grad_kernel",
        ([&] {
          dim3 blocks(std::min(
              at::cuda::ATenCeilDiv(num_input, threadsPerBlock), maxGridDim));
          dim3 threads(threadsPerBlock);
          add_reduce_traceback_grad_kernel<<<blocks, threads>>>(
              grad_feats.data_ptr<scalar_t>(),
              grad_reduced_feats.data_ptr<scalar_t>(),
              coors_map.data_ptr<int32_t>(), reduce_count.data_ptr<int32_t>(),
              num_input, num_feats, reduce_type);
        }));
    AT_CUDA_CHECK(cudaGetLastError());
  } else {
    auto reduce_from = at::full({num_reduced, num_feats}, num_input,
                                coors_map.options().dtype(torch::kInt32));
    AT_DISPATCH_FLOATING_TYPES(
        grad_reduced_feats.scalar_type(),
        "max_reduce_traceback_scatter_idx_kernel", ([&] {
          dim3 blocks(std::min(
              at::cuda::ATenCeilDiv(num_input, threadsPerBlock), maxGridDim));
          dim3 threads(threadsPerBlock);
          max_reduce_traceback_scatter_idx_kernel<<<blocks, threads>>>(
              feats.data_ptr<scalar_t>(), reduced_feats.data_ptr<scalar_t>(),
              reduce_from.data_ptr<int32_t>(), coors_map.data_ptr<int32_t>(),
              num_input, num_feats);
        }));
    AT_CUDA_CHECK(cudaGetLastError());

    AT_DISPATCH_FLOATING_TYPES(
        grad_reduced_feats.scalar_type(),
        "max_reduce_traceback_scatter_idx_kernel", ([&] {
          dim3 blocks(std::min(
              at::cuda::ATenCeilDiv(num_reduced, threadsPerBlock), maxGridDim));
          dim3 threads(threadsPerBlock);
          max_reduce_scatter_grad_kernel<<<blocks, threads>>>(
              grad_feats.data_ptr<scalar_t>(),
              grad_reduced_feats.data_ptr<scalar_t>(),
              reduce_from.data_ptr<int32_t>(), num_reduced, num_feats);
        }));
    AT_CUDA_CHECK(cudaGetLastError());
  }
  return;
}

}  // namespace voxelization