three_nn_cuda.cu 2.34 KB
Newer Older
wuyuefeng's avatar
wuyuefeng committed
1
2
3
4
5
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define THREADS_PER_BLOCK 256
zhangwenwei's avatar
zhangwenwei committed
6
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
wuyuefeng's avatar
wuyuefeng committed
7

zhangwenwei's avatar
zhangwenwei committed
8
9
10
11
12
13
14
15
16
17
__global__ void three_nn_kernel(int b, int n, int m,
                                const float *__restrict__ unknown,
                                const float *__restrict__ known,
                                float *__restrict__ dist2,
                                int *__restrict__ idx) {
  // unknown: (B, N, 3)
  // known: (B, M, 3)
  // output:
  //      dist2: (B, N, 3)
  //      idx: (B, N, 3)
wuyuefeng's avatar
wuyuefeng committed
18

zhangwenwei's avatar
zhangwenwei committed
19
20
21
  int bs_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (bs_idx >= b || pt_idx >= n) return;
wuyuefeng's avatar
wuyuefeng committed
22

zhangwenwei's avatar
zhangwenwei committed
23
24
25
26
  unknown += bs_idx * n * 3 + pt_idx * 3;
  known += bs_idx * m * 3;
  dist2 += bs_idx * n * 3 + pt_idx * 3;
  idx += bs_idx * n * 3 + pt_idx * 3;
wuyuefeng's avatar
wuyuefeng committed
27

zhangwenwei's avatar
zhangwenwei committed
28
29
30
  float ux = unknown[0];
  float uy = unknown[1];
  float uz = unknown[2];
wuyuefeng's avatar
wuyuefeng committed
31

zhangwenwei's avatar
zhangwenwei committed
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
  double best1 = 1e40, best2 = 1e40, best3 = 1e40;
  int besti1 = 0, besti2 = 0, besti3 = 0;
  for (int k = 0; k < m; ++k) {
    float x = known[k * 3 + 0];
    float y = known[k * 3 + 1];
    float z = known[k * 3 + 2];
    float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
    if (d < best1) {
      best3 = best2;
      besti3 = besti2;
      best2 = best1;
      besti2 = besti1;
      best1 = d;
      besti1 = k;
    } else if (d < best2) {
      best3 = best2;
      besti3 = besti2;
      best2 = d;
      besti2 = k;
    } else if (d < best3) {
      best3 = d;
      besti3 = k;
wuyuefeng's avatar
wuyuefeng committed
54
    }
zhangwenwei's avatar
zhangwenwei committed
55
56
57
58
59
60
61
  }
  dist2[0] = best1;
  dist2[1] = best2;
  dist2[2] = best3;
  idx[0] = besti1;
  idx[1] = besti2;
  idx[2] = besti3;
wuyuefeng's avatar
wuyuefeng committed
62
63
64
}

void three_nn_kernel_launcher(int b, int n, int m, const float *unknown,
zhangwenwei's avatar
zhangwenwei committed
65
66
67
68
69
70
71
                              const float *known, float *dist2, int *idx,
                              cudaStream_t stream) {
  // unknown: (B, N, 3)
  // known: (B, M, 3)
  // output:
  //      dist2: (B, N, 3)
  //      idx: (B, N, 3)
wuyuefeng's avatar
wuyuefeng committed
72

zhangwenwei's avatar
zhangwenwei committed
73
74
75
76
  cudaError_t err;
  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK),
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);
wuyuefeng's avatar
wuyuefeng committed
77

zhangwenwei's avatar
zhangwenwei committed
78
79
  three_nn_kernel<<<blocks, threads, 0, stream>>>(b, n, m, unknown, known,
                                                  dist2, idx);
wuyuefeng's avatar
wuyuefeng committed
80

zhangwenwei's avatar
zhangwenwei committed
81
82
83
84
85
  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
wuyuefeng's avatar
wuyuefeng committed
86
}