Speed up nms_cuda (#1704)

1. Let the IOU function compare with threshold. This avoid a division. Similar strategy is also used in https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/non_max_suppression_op.cu.cc 2. Only compute the upper triangle of the mask. This speeds up the kernel about 20% (tested on GTX 1080Ti, with 20 input cases dumped from a Mask R-CNN inference job).

Speed up nms_cuda (#1704)
1. Let the IOU function compare with threshold. This avoid a division. Similar strategy is also used in https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/non_max_suppression_op.cu.cc 2. Only compute the upper triangle of the mask. This speeds up the kernel about 20% (tested on GTX 1080Ti, with 20 input cases dumped from a Mask R-CNN inference job).
06cbdb5b · Yuxin Wu · Francisco Massa · 2a174229 · 06cbdb5b
Commit 06cbdb5b authored Jan 02, 2020 by Yuxin Wu Committed by Francisco Massa Jan 02, 2020
Show whitespace changes
Inline Side-by-side

Showing with 4 additions and 4 deletions

torchvision/csrc/cuda/nms_cuda.cu torchvision/csrc/cuda/nms_cuda.cu +4 -4

No files found.
--- a/torchvision/csrc/cuda/nms_cuda.cu
+++ b/torchvision/csrc/cuda/nms_cuda.cu
@@ -11,14 +11,14 @@
 int const threadsPerBlock = sizeof(unsigned long long) * 8;

 template <typename T>
-__device__ inline float devIoU(T const* const a, T const* const b) {
+__device__ inline bool devIoU(T const* const a, T const* const b, const float threshold) {
  T left = max(a[0], b[0]), right = min(a[2], b[2]);
  T top = max(a[1], b[1]), bottom = min(a[3], b[3]);
  T width = max(right - left, (T)0), height = max(bottom - top, (T)0);
  T interS = width * height;
  T Sa = (a[2] - a[0]) * (a[3] - a[1]);
  T Sb = (b[2] - b[0]) * (b[3] - b[1]);
-  return interS / (Sa + Sb - interS);
+  return interS > threshold * (Sa + Sb - interS);
 }

 template <typename T>
@@ -30,7 +30,7 @@ __global__ void nms_kernel(
  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

-  // if (row_start > col_start) return;
+  if (row_start > col_start) return;

  const int row_size =
      min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
@@ -60,7 +60,7 @@ __global__ void nms_kernel(
      start = threadIdx.x + 1;
    }
    for (i = start; i < col_size; i++) {
-      if (devIoU<T>(cur_box, block_boxes + i * 4) > iou_threshold) {
+      if (devIoU<T>(cur_box, block_boxes + i * 4, iou_threshold)) {
        t |= 1ULL << i;
      }
    }