// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu
#include <stdio.h>
#include <stdlib.h>

#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
__global__ void group_points_grad_kernel(int b, int c, int n, int npoints,
                                         int nsample,
                                         const float *__restrict__ grad_out,
                                         const int *__restrict__ idx,
                                         float *__restrict__ grad_points) {
  // Backward of grouping: scatter-add gradients back to the source points.
  // grad_out: (B, C, npoints, nsample)
  // idx: (B, npoints, nsample)
  // output:
  //      grad_points: (B, C, N)
  // Launch layout: grid.z = batch, grid.y = channel,
  // grid.x * blockDim.x >= npoints * nsample (one thread per sample slot).
  const int batch = blockIdx.z;
  const int channel = blockIdx.y;
  const int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const int point = thread_id / nsample;
  const int sample = thread_id % nsample;
  if (batch >= b || channel >= c || point >= npoints) return;

  // Flat offset of this (batch, channel, point, sample) element of grad_out.
  const int grad_offset =
      ((batch * c + channel) * npoints + point) * nsample + sample;
  // Index of the source point this sample was gathered from.
  const int src = idx[(batch * npoints + point) * nsample + sample];
  // Several samples may reference the same source point, so accumulate
  // atomically into grad_points.
  atomicAdd(grad_points + (batch * c + channel) * n + src,
            grad_out[grad_offset]);
}

void group_points_grad_kernel_launcher(int b, int c, int n, int npoints,
                                       int nsample, const float *grad_out,
                                       const int *idx, float *grad_points,
                                       cudaStream_t stream) {
  // Host-side launcher for group_points_grad_kernel.
  // grad_out: (B, C, npoints, nsample)
  // idx: (B, npoints, nsample)
  // output:
  //      grad_points: (B, C, N)
  // One thread per (point, sample) slot; channels and batches map to the
  // grid's y and z dimensions.
  dim3 threads(THREADS_PER_BLOCK);
  dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);

  group_points_grad_kernel<<<blocks, threads, 0, stream>>>(
      b, c, n, npoints, nsample, grad_out, idx, grad_points);

  // A kernel launch reports configuration errors via cudaGetLastError().
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}

__global__ void group_points_kernel(int b, int c, int n, int npoints,
                                    int nsample,
                                    const float *__restrict__ points,
                                    const int *__restrict__ idx,
                                    float *__restrict__ out) {
  // Forward grouping: gather point features according to idx.
  // points: (B, C, N)
  // idx: (B, npoints, nsample)
  // output:
  //      out: (B, C, npoints, nsample)
  // Launch layout: grid.z = batch, grid.y = channel,
  // grid.x * blockDim.x >= npoints * nsample (one thread per sample slot).
  const int batch = blockIdx.z;
  const int channel = blockIdx.y;
  const int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const int point = thread_id / nsample;
  const int sample = thread_id % nsample;
  if (batch >= b || channel >= c || point >= npoints) return;

  // Index of the source point to copy from for this (point, sample) slot.
  const int src = idx[(batch * npoints + point) * nsample + sample];
  const int dst =
      ((batch * c + channel) * npoints + point) * nsample + sample;
  out[dst] = points[(batch * c + channel) * n + src];
}

void group_points_kernel_launcher(int b, int c, int n, int npoints, int nsample,
                                  const float *points, const int *idx,
                                  float *out, cudaStream_t stream) {
  // Host-side launcher for group_points_kernel.
  // points: (B, C, N)
  // idx: (B, npoints, nsample)
  // output:
  //      out: (B, C, npoints, nsample)
  // One thread per (point, sample) slot; channels and batches map to the
  // grid's y and z dimensions.
  dim3 threads(THREADS_PER_BLOCK);
  dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);

  group_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, nsample,
                                                      points, idx, out);
  // cudaDeviceSynchronize();  // for using printf in kernel function

  // A kernel launch reports configuration errors via cudaGetLastError().
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}