nms.cu 5.58 KB
Newer Older
traveller59's avatar
traveller59 committed
1
2
3
4
// ------------------------------------------------------------------
// Deformable Convolutional Networks
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License
5
6
// Modified from MATLAB Faster R-CNN
// (https://github.com/shaoqingren/faster_rcnn)
traveller59's avatar
traveller59 committed
7
8
// ------------------------------------------------------------------
#include <cuda_runtime.h>
9
#include <iostream>
traveller59's avatar
traveller59 committed
10
#include <spconv/nms_gpu.h>
11
#include <vector>
traveller59's avatar
traveller59 committed
12

13
14
15
16
17
18
19
#define CUDA_CHECK(condition)                                                  \
  /* Code block avoids redefinition of cudaError_t error */                    \
  do {                                                                         \
    cudaError_t error = condition;                                             \
    if (error != cudaSuccess) {                                                \
      std::cout << cudaGetErrorString(error) << std::endl;                     \
    }                                                                          \
traveller59's avatar
traveller59 committed
20
21
22
23
24
25
  } while (0)

#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
int const threadsPerBlock = sizeof(unsigned long long) * 8;

template <typename DType>
26
__device__ inline DType devIoU(DType const *const a, DType const *const b) {
traveller59's avatar
traveller59 committed
27
28
29
30
31
32
33
34
35
36
37
  DType left = max(a[0], b[0]), right = min(a[2], b[2]);
  DType top = max(a[1], b[1]), bottom = min(a[3], b[3]);
  DType width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
  DType interS = width * height;
  DType Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  DType Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
  return interS / (Sa + Sb - interS);
}

template <typename DType, int BLOCK_THREADS>
__global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
38
39
                           const DType *dev_boxes,
                           unsigned long long *dev_mask) {
traveller59's avatar
traveller59 committed
40
41
42
43
44
  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  // if (row_start > col_start) return;

45
46
  const int row_size = min(n_boxes - row_start * BLOCK_THREADS, BLOCK_THREADS);
  const int col_size = min(n_boxes - col_start * BLOCK_THREADS, BLOCK_THREADS);
traveller59's avatar
traveller59 committed
47
48

  __shared__ DType block_boxes[BLOCK_THREADS * 5];
49
  if (threadIdx.x < col_size) {
traveller59's avatar
traveller59 committed
50
#pragma unroll
51
    for (int i = 0; i < 5; ++i) {
traveller59's avatar
traveller59 committed
52
53
54
55
56
57
      block_boxes[threadIdx.x * 5 + i] =
          dev_boxes[(BLOCK_THREADS * col_start + threadIdx.x) * 5 + i];
    }
  }
  __syncthreads();

58
  if (threadIdx.x < row_size) {
traveller59's avatar
traveller59 committed
59
60
61
62
    const int cur_box_idx = BLOCK_THREADS * row_start + threadIdx.x;
    const DType *cur_box = dev_boxes + cur_box_idx * 5;
    unsigned long long t = 0;
    int start = 0;
63
    if (row_start == col_start) {
traveller59's avatar
traveller59 committed
64
65
      start = threadIdx.x + 1;
    }
66
67
    for (int i = start; i < col_size; i++) {
      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
traveller59's avatar
traveller59 committed
68
69
70
71
72
73
74
75
        t |= 1ULL << i;
      }
    }
    const int col_blocks = DIVUP(n_boxes, BLOCK_THREADS);
    dev_mask[cur_box_idx * col_blocks + col_start] = t;
  }
}

76
void _set_device(int device_id) {
traveller59's avatar
traveller59 committed
77
78
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
79
  if (current_device == device_id) {
traveller59's avatar
traveller59 committed
80
81
82
83
84
85
86
87
88
    return;
  }
  // The call to cudaSetDevice must come before any calls to Get, which
  // may perform initialization using the GPU.
  CUDA_CHECK(cudaSetDevice(device_id));
}

template <typename DType, int BLOCK_THREADS>
int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
89
             int boxes_dim, DType nms_overlap_thresh, int device_id) {
traveller59's avatar
traveller59 committed
90
91
92
93
94
95
96
  _set_device(device_id);

  DType *boxes_dev = NULL;
  unsigned long long *mask_dev = NULL;

  const int col_blocks = DIVUP(boxes_num, BLOCK_THREADS);

97
98
  CUDA_CHECK(cudaMalloc(&boxes_dev, boxes_num * boxes_dim * sizeof(DType)));
  CUDA_CHECK(cudaMemcpy(boxes_dev, boxes_host,
traveller59's avatar
traveller59 committed
99
100
101
102
103
104
                        boxes_num * boxes_dim * sizeof(DType),
                        cudaMemcpyHostToDevice));

  CUDA_CHECK(cudaMalloc(&mask_dev,
                        boxes_num * col_blocks * sizeof(unsigned long long)));

105
  dim3 blocks(DIVUP(boxes_num, BLOCK_THREADS), DIVUP(boxes_num, BLOCK_THREADS));
traveller59's avatar
traveller59 committed
106
  dim3 threads(BLOCK_THREADS);
107
108
  nms_kernel<DType, BLOCK_THREADS>
      <<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes_dev, mask_dev);
traveller59's avatar
traveller59 committed
109
110

  std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
111
  CUDA_CHECK(cudaMemcpy(&mask_host[0], mask_dev,
traveller59's avatar
traveller59 committed
112
113
114
115
116
117
118
                        sizeof(unsigned long long) * boxes_num * col_blocks,
                        cudaMemcpyDeviceToHost));

  std::vector<unsigned long long> remv(col_blocks);
  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);

  int num_to_keep = 0;
119
  for (int i = 0; i < boxes_num; i++) {
traveller59's avatar
traveller59 committed
120
121
122
    int nblock = i / BLOCK_THREADS;
    int inblock = i % BLOCK_THREADS;

123
    if (!(remv[nblock] & (1ULL << inblock))) {
traveller59's avatar
traveller59 committed
124
125
      keep_out[num_to_keep++] = i;
      unsigned long long *p = &mask_host[0] + i * col_blocks;
126
      for (int j = nblock; j < col_blocks; j++) {
traveller59's avatar
traveller59 committed
127
128
129
130
131
132
133
134
135
136
        remv[j] |= p[j];
      }
    }
  }

  CUDA_CHECK(cudaFree(boxes_dev));
  CUDA_CHECK(cudaFree(mask_dev));
  return num_to_keep;
}

137
138
139
140
141
142
143
144
145
146
147
148
// template<>
template int _nms_gpu<float, threadsPerBlock>(int *keep_out,
                                              const float *boxes_host,
                                              int boxes_num, int boxes_dim,
                                              float nms_overlap_thresh,
                                              int device_id);
// template<>
template int _nms_gpu<double, threadsPerBlock>(int *keep_out,
                                               const double *boxes_host,
                                               int boxes_num, int boxes_dim,
                                               double nms_overlap_thresh,
                                               int device_id);