// Modified from
// https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms.cpp

/*
3D IoU Calculation and Rotated NMS (modified from 2D NMS written by others)
Written by Shaoshuai Shi
All Rights Reserved 2019-2020.
*/

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>

#include <cstdint>
#include <vector>

#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDA tensor ")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

#define CHECK_ERROR(ans) \
  { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line,
                      bool abort = true) {
  if (code != cudaSuccess) {
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
            line);
    if (abort) exit(code);
  }
}

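// NMS kernels handle 64 boxes per block: one suppression bit per box, packed
// into a single unsigned long long mask word.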
const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;

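// Launchers for the CUDA kernels; they are implemented in the companion .cu
// source compiled together with this file.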
void boxesoverlapLauncher(const int num_a, const float *boxes_a,
                          const int num_b, const float *boxes_b,
                          float *ans_overlap);
void boxesioubevLauncher(const int num_a, const float *boxes_a, const int num_b,
                         const float *boxes_b, float *ans_iou);
void nmsLauncher(const float *boxes, unsigned long long *mask, int boxes_num,
                 float nms_overlap_thresh);
void nmsNormalLauncher(const float *boxes, unsigned long long *mask,
                       int boxes_num, float nms_overlap_thresh);

int boxes_overlap_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b,
                          at::Tensor ans_overlap) {
  // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
  // params boxes_b: (M, 5)
  // params ans_overlap: (N, M)

  CHECK_INPUT(boxes_a);
  CHECK_INPUT(boxes_b);
  CHECK_INPUT(ans_overlap);

  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

  const float *boxes_a_data = boxes_a.data_ptr<float>();
  const float *boxes_b_data = boxes_b.data_ptr<float>();
  float *ans_overlap_data = ans_overlap.data_ptr<float>();

  boxesoverlapLauncher(num_a, boxes_a_data, num_b, boxes_b_data,
                       ans_overlap_data);

  return 1;
}

int boxes_iou_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b,
                      at::Tensor ans_iou) {
  // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
  // params boxes_b: (M, 5)
  // params ans_iou: (N, M)

  CHECK_INPUT(boxes_a);
  CHECK_INPUT(boxes_b);
  CHECK_INPUT(ans_iou);

  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

  const float *boxes_a_data = boxes_a.data_ptr<float>();
  const float *boxes_b_data = boxes_b.data_ptr<float>();
  float *ans_iou_data = ans_iou.data_ptr<float>();

  boxesioubevLauncher(num_a, boxes_a_data, num_b, boxes_b_data, ans_iou_data);

  return 1;
}

int nms_gpu(at::Tensor boxes, at::Tensor keep,
            float nms_overlap_thresh, int device_id) {
  // params boxes: (N, 5) [x1, y1, x2, y2, ry]
  // params keep: (N)

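  // keep only needs to be contiguous (not on the GPU): it is written on the
  // host after the device-side mask has been copied back.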
  CHECK_INPUT(boxes);
  CHECK_CONTIGUOUS(keep);
  cudaSetDevice(device_id);

  int boxes_num = boxes.size(0);
  const float *boxes_data = boxes.data_ptr<float>();
  int64_t *keep_data = keep.data_ptr<int64_t>();

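  // Number of 64-bit mask words needed to cover boxes_num suppression bits.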
  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);

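  // Device-side suppression mask: one row of col_blocks 64-bit words per box,
  // filled by the kernel with the boxes each box overlaps above the threshold.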
  unsigned long long *mask_data = NULL;
  CHECK_ERROR(cudaMalloc((void **)&mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long)));
  nmsLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);

  // unsigned long long mask_cpu[boxes_num * col_blocks];
  // unsigned long long *mask_cpu = new unsigned long long [boxes_num *
  // col_blocks];
  std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks);

  //    printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks);
  CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long),
                         cudaMemcpyDeviceToHost));

  cudaFree(mask_data);

  unsigned long long *remv_cpu = new unsigned long long[col_blocks]();

  int num_to_keep = 0;

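  // Greedy scan over the mask rows: keep box i unless a previously kept box
  // has already set its suppression bit, then OR its row into remv_cpu so the
  // boxes it overlaps are suppressed in turn.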
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_cpu[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }
  delete[] remv_cpu;
  if (cudaSuccess != cudaGetLastError()) printf("Error!\n");

  return num_to_keep;
}

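// Same host-side bookkeeping as nms_gpu above; only the kernel launcher
// differs (nmsNormalLauncher, the non-oriented variant per the bindings below).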
int nms_normal_gpu(at::Tensor boxes, at::Tensor keep,
                   float nms_overlap_thresh, int device_id) {
  // params boxes: (N, 5) [x1, y1, x2, y2, ry]
  // params keep: (N)

  CHECK_INPUT(boxes);
  CHECK_CONTIGUOUS(keep);
  cudaSetDevice(device_id);

  int boxes_num = boxes.size(0);
  const float *boxes_data = boxes.data_ptr<float>();
  int64_t *keep_data = keep.data_ptr<int64_t>();

  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);

  unsigned long long *mask_data = NULL;
  CHECK_ERROR(cudaMalloc((void **)&mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long)));
  nmsNormalLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);

  // unsigned long long mask_cpu[boxes_num * col_blocks];
  // unsigned long long *mask_cpu = new unsigned long long [boxes_num *
  // col_blocks];
  std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks);

  //    printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks);
  CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long),
                         cudaMemcpyDeviceToHost));

  cudaFree(mask_data);

  unsigned long long *remv_cpu = new unsigned long long[col_blocks]();

  int num_to_keep = 0;

  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_cpu[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }
  delete[] remv_cpu;
  if (cudaSuccess != cudaGetLastError()) printf("Error!\n");

  return num_to_keep;
}

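// Rough usage sketch from Python (module name assumed to be `iou3d_cuda`;
// it depends on how the extension is built):
//
//   import torch, iou3d_cuda
//   boxes = boxes.float().cuda().contiguous()  # (N, 5), typically pre-sorted by descending score
//   keep = torch.zeros(boxes.size(0), dtype=torch.long)
//   num_out = iou3d_cuda.nms_gpu(boxes, keep, 0.5, boxes.device.index)
//   kept = keep[:num_out]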
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("boxes_overlap_bev_gpu", &boxes_overlap_bev_gpu,
        "oriented boxes overlap");
  m.def("boxes_iou_bev_gpu", &boxes_iou_bev_gpu, "oriented boxes iou");
  m.def("nms_gpu", &nms_gpu, "oriented nms gpu");
  m.def("nms_normal_gpu", &nms_normal_gpu, "nms gpu");
}