// nms_kernel.cu
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <torch/library.h>

#include "cuda_helpers.h"

namespace vision {
namespace ops {

namespace {

// 64 threads per block: one bit per box in an unsigned long long mask word.
int const threadsPerBlock = sizeof(unsigned long long) * 8;

// Returns true when the IoU of boxes a and b (each given as x1, y1, x2, y2)
// exceeds the threshold; areas are accumulated in a wider type so that
// half-precision inputs do not lose precision.
template <typename T>
__device__ inline bool devIoU(
    T const* const a,
    T const* const b,
    const float threshold) {
  T left = max(a[0], b[0]), right = min(a[2], b[2]);
  T top = max(a[1], b[1]), bottom = min(a[3], b[3]);
  T width = max(right - left, (T)0), height = max(bottom - top, (T)0);
  using acc_T = at::acc_type<T, /*is_cuda=*/true>;
  acc_T interS = (acc_T)width * height;
  acc_T Sa = ((acc_T)a[2] - a[0]) * (a[3] - a[1]);
  acc_T Sb = ((acc_T)b[2] - b[0]) * (b[3] - b[1]);
  return (interS / (Sa + Sb - interS)) > threshold;
}

// Builds the pairwise suppression mask: each block compares one tile of up to
// 64 "row" boxes against one tile of 64 "column" boxes. Lower-triangle blocks
// (row_start > col_start) exit early because IoU is symmetric.
template <typename T>
__global__ void nms_kernel_impl(
    int n_boxes,
    double iou_threshold,
    const T* dev_boxes,
    unsigned long long* dev_mask) {
  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  if (row_start > col_start)
    return;

  const int row_size =
      min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
  const int col_size =
      min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);

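  // Stage this block's column tile of boxes in shared memory.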
  __shared__ T block_boxes[threadsPerBlock * 4];
  if (threadIdx.x < col_size) {
    block_boxes[threadIdx.x * 4 + 0] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 0];
    block_boxes[threadIdx.x * 4 + 1] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 1];
    block_boxes[threadIdx.x * 4 + 2] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 2];
    block_boxes[threadIdx.x * 4 + 3] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 3];
  }
  __syncthreads();

  // Each thread owns one row box and records in a 64-bit word which of the
  // column boxes it suppresses (IoU above the threshold).
  if (threadIdx.x < row_size) {
    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
    const T* cur_box = dev_boxes + cur_box_idx * 4;
    int i = 0;
    unsigned long long t = 0;
    int start = 0;
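    // In diagonal blocks, only compare against later boxes so a box is never
    // tested against itself and no pair is tested twice.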
    if (row_start == col_start) {
      start = threadIdx.x + 1;
    }
    for (i = start; i < col_size; i++) {
      if (devIoU<T>(cur_box, block_boxes + i * 4, iou_threshold)) {
        t |= 1ULL << i;
      }
    }
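    // One mask word per column tile; store this thread's word for its row box.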
    const int col_blocks = ceil_div(n_boxes, threadsPerBlock);
    dev_mask[cur_box_idx * col_blocks + col_start] = t;
  }
}

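// Host entry point for torchvision::nms on CUDA: returns the indices of the
// boxes kept after non-maximum suppression at the given IoU threshold, in
// descending score order.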
at::Tensor nms_kernel(
    const at::Tensor& dets,
    const at::Tensor& scores,
    double iou_threshold) {
  TORCH_CHECK(dets.is_cuda(), "dets must be a CUDA tensor");
  TORCH_CHECK(scores.is_cuda(), "scores must be a CUDA tensor");

  TORCH_CHECK(
      dets.dim() == 2, "boxes should be a 2d tensor, got ", dets.dim(), "D");
  TORCH_CHECK(
      dets.size(1) == 4,
      "boxes should have 4 elements in dimension 1, got ",
      dets.size(1));
  TORCH_CHECK(
      scores.dim() == 1,
      "scores should be a 1d tensor, got ",
      scores.dim(),
      "D");
  TORCH_CHECK(
      dets.size(0) == scores.size(0),
      "boxes and scores should have same number of elements in ",
      "dimension 0, got ",
      dets.size(0),
      " and ",
      scores.size(0));

  at::cuda::CUDAGuard device_guard(dets.device());

  if (dets.numel() == 0) {
    return at::empty({0}, dets.options().dtype(at::kLong));
  }

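  // Sort boxes by score, highest first; suppression is applied in this order.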
  auto order_t = std::get<1>(
      scores.sort(/*stable=*/true, /*dim=*/0, /* descending=*/true));
  auto dets_sorted = dets.index_select(0, order_t).contiguous();

  int dets_num = dets.size(0);

  const int col_blocks = ceil_div(dets_num, threadsPerBlock);

  // mask[i * col_blocks + j] is a 64-bit word marking which boxes in column
  // tile j are suppressed by box i (indices refer to the sorted order).
  at::Tensor mask =
      at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong));

  // Launch one block per (row tile, column tile) pair.
  dim3 blocks(col_blocks, col_blocks);
  dim3 threads(threadsPerBlock);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      dets_sorted.scalar_type(), "nms_kernel", [&] {
        nms_kernel_impl<scalar_t><<<blocks, threads, 0, stream>>>(
            dets_num,
            iou_threshold,
            dets_sorted.data_ptr<scalar_t>(),
            (unsigned long long*)mask.data_ptr<int64_t>());
      });

  // Bring the suppression mask back to the host for the sequential greedy pass.
  at::Tensor mask_cpu = mask.to(at::kCPU);
  unsigned long long* mask_host =
      (unsigned long long*)mask_cpu.data_ptr<int64_t>();

  // remv[j] accumulates, per column tile, the boxes suppressed by kept boxes.
  std::vector<unsigned long long> remv(col_blocks);
  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);

  at::Tensor keep =
      at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));
  int64_t* keep_out = keep.data_ptr<int64_t>();

  // Greedy pass: visit boxes in descending score order, keep a box only if no
  // previously kept box suppresses it, then fold its mask row into remv.
  int num_to_keep = 0;
  for (int i = 0; i < dets_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;

    if (!(remv[nblock] & (1ULL << inblock))) {
      keep_out[num_to_keep++] = i;
      unsigned long long* p = mask_host + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }

  AT_CUDA_CHECK(cudaGetLastError());
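  // Map the kept positions (in sorted order) back to indices into the
  // original, unsorted boxes.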
  return order_t.index(
      {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)
           .to(order_t.device(), keep.scalar_type())});
}

} // namespace

TORCH_LIBRARY_IMPL(torchvision, CUDA, m) {
  m.impl(TORCH_SELECTIVE_NAME("torchvision::nms"), TORCH_FN(nms_kernel));
}

} // namespace ops
} // namespace vision