Commit 249cfbf5 authored by Varun Agrawal, committed by Francisco Massa

nms_cuda signature update (#945)

Updated the nms_cuda signature to accept detections and scores as two separate tensors instead of a single N x 5 tensor with the scores packed into the last column. This also required updating the box indexing in the NMS CUDA kernel, since each box now occupies 4 values rather than 5.

Also renamed the threshold parameter to iou_threshold so the name is consistent across the CPU, CUDA, and dispatcher implementations.
parent b3cb86f2
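
For illustration, here is a minimal caller-side sketch of the API change (not part of the commit; the wrapper function, the 0.5 threshold, and the surrounding boilerplate are assumptions for the example; dets is an N x 4 CUDA tensor of boxes and scores an N-element CUDA tensor):

#include <ATen/ATen.h>

// Declaration matching the updated CUDA header shown in the diff below.
at::Tensor nms_cuda(
    const at::Tensor& dets,
    const at::Tensor& scores,
    const float iou_threshold);

// Hypothetical caller contrasting the old packed-tensor call with the new one.
at::Tensor run_nms_example(const at::Tensor& dets, const at::Tensor& scores) {
  // Before this commit, scores had to be concatenated into column 4 of an
  // N x 5 tensor before calling nms_cuda:
  //   auto packed = at::cat({dets, scores.unsqueeze(1)}, 1);
  //   return nms_cuda(packed, 0.5f);

  // After this commit, boxes and scores stay separate and the threshold
  // parameter is named iou_threshold everywhere.
  return nms_cuda(dets, scores, /*iou_threshold=*/0.5f);
}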
@@ -4,7 +4,7 @@ template <typename scalar_t>
 at::Tensor nms_cpu_kernel(
     const at::Tensor& dets,
     const at::Tensor& scores,
-    const float threshold) {
+    const float iou_threshold) {
   AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor");
   AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor");
   AT_ASSERTM(
@@ -61,7 +61,7 @@ at::Tensor nms_cpu_kernel(
       auto h = std::max(static_cast<scalar_t>(0), yy2 - yy1);
       auto inter = w * h;
       auto ovr = inter / (iarea + areas[j] - inter);
-      if (ovr >= threshold)
+      if (ovr >= iou_threshold)
         suppressed[j] = 1;
     }
   }
@@ -71,11 +71,11 @@ at::Tensor nms_cpu_kernel(
 at::Tensor nms_cpu(
     const at::Tensor& dets,
     const at::Tensor& scores,
-    const float threshold) {
+    const float iou_threshold) {
   auto result = at::empty({0}, dets.options());
   AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] {
-    result = nms_cpu_kernel<scalar_t>(dets, scores, threshold);
+    result = nms_cpu_kernel<scalar_t>(dets, scores, iou_threshold);
   });
   return result;
 }
@@ -43,4 +43,4 @@ at::Tensor ROIAlign_backward_cpu(
 at::Tensor nms_cpu(
     const at::Tensor& dets,
     const at::Tensor& scores,
-    const float threshold);
+    const float iou_threshold);
@@ -24,7 +24,7 @@ __device__ inline float devIoU(T const* const a, T const* const b) {
 template <typename T>
 __global__ void nms_kernel(
     const int n_boxes,
-    const float nms_overlap_thresh,
+    const float iou_threshold,
     const T* dev_boxes,
     unsigned long long* dev_mask) {
   const int row_start = blockIdx.y;
@@ -37,24 +37,22 @@ __global__ void nms_kernel(
   const int col_size =
       min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
-  __shared__ T block_boxes[threadsPerBlock * 5];
+  __shared__ T block_boxes[threadsPerBlock * 4];
   if (threadIdx.x < col_size) {
-    block_boxes[threadIdx.x * 5 + 0] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
-    block_boxes[threadIdx.x * 5 + 1] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
-    block_boxes[threadIdx.x * 5 + 2] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
-    block_boxes[threadIdx.x * 5 + 3] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
-    block_boxes[threadIdx.x * 5 + 4] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
+    block_boxes[threadIdx.x * 4 + 0] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 0];
+    block_boxes[threadIdx.x * 4 + 1] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 1];
+    block_boxes[threadIdx.x * 4 + 2] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 2];
+    block_boxes[threadIdx.x * 4 + 3] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 3];
   }
   __syncthreads();
   if (threadIdx.x < row_size) {
     const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
-    const T* cur_box = dev_boxes + cur_box_idx * 5;
+    const T* cur_box = dev_boxes + cur_box_idx * 4;
     int i = 0;
     unsigned long long t = 0;
     int start = 0;
@@ -62,7 +60,7 @@ __global__ void nms_kernel(
       start = threadIdx.x + 1;
     }
     for (i = start; i < col_size; i++) {
-      if (devIoU<T>(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
+      if (devIoU<T>(cur_box, block_boxes + i * 4) > iou_threshold) {
         t |= 1ULL << i;
       }
     }
@@ -71,33 +69,34 @@ __global__ void nms_kernel(
   }
 }
-// boxes is a N x 5 tensor
-at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
+at::Tensor nms_cuda(const at::Tensor& dets,
+                    const at::Tensor& scores,
+                    float iou_threshold) {
   using scalar_t = float;
-  AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor");
-  at::cuda::CUDAGuard device_guard(boxes.device());
+  AT_ASSERTM(dets.type().is_cuda(), "dets must be a CUDA tensor");
+  AT_ASSERTM(scores.type().is_cuda(), "scores must be a CUDA tensor");
+  at::cuda::CUDAGuard device_guard(dets.device());
-  auto scores = boxes.select(1, 4);
   auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
-  auto boxes_sorted = boxes.index_select(0, order_t);
+  auto dets_sorted = dets.index_select(0, order_t);
-  int boxes_num = boxes.size(0);
+  int dets_num = dets.size(0);
-  const int col_blocks = at::cuda::ATenCeilDiv(boxes_num, threadsPerBlock);
+  const int col_blocks = at::cuda::ATenCeilDiv(dets_num, threadsPerBlock);
   at::Tensor mask =
-      at::empty({boxes_num * col_blocks}, boxes.options().dtype(at::kLong));
+      at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong));
   dim3 blocks(col_blocks, col_blocks);
   dim3 threads(threadsPerBlock);
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      boxes_sorted.type(), "nms_kernel_cuda", [&] {
+      dets_sorted.type(), "nms_kernel_cuda", [&] {
         nms_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
-            boxes_num,
-            nms_overlap_thresh,
-            boxes_sorted.data<scalar_t>(),
+            dets_num,
+            iou_threshold,
+            dets_sorted.data<scalar_t>(),
             (unsigned long long*)mask.data<int64_t>());
       });
@@ -108,11 +107,11 @@ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
   memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
   at::Tensor keep =
-      at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
+      at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));
   int64_t* keep_out = keep.data<int64_t>();
   int num_to_keep = 0;
-  for (int i = 0; i < boxes_num; i++) {
+  for (int i = 0; i < dets_num; i++) {
     int nblock = i / threadsPerBlock;
     int inblock = i % threadsPerBlock;
@@ -41,4 +41,7 @@ at::Tensor ROIPool_backward_cuda(
     const int height,
     const int width);
-at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh);
+at::Tensor nms_cuda(
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    const float iou_threshold);
@@ -8,20 +8,19 @@
 at::Tensor nms(
     const at::Tensor& dets,
     const at::Tensor& scores,
-    const float threshold) {
+    const float iou_threshold) {
   if (dets.device().is_cuda()) {
 #ifdef WITH_CUDA
     if (dets.numel() == 0) {
       at::cuda::CUDAGuard device_guard(dets.device());
       return at::empty({0}, dets.options().dtype(at::kLong));
     }
-    auto b = at::cat({dets, scores.unsqueeze(1)}, 1);
-    return nms_cuda(b, threshold);
+    return nms_cuda(dets, scores, iou_threshold);
 #else
     AT_ERROR("Not compiled with GPU support");
 #endif
   }
-  at::Tensor result = nms_cpu(dets, scores, threshold);
+  at::Tensor result = nms_cpu(dets, scores, iou_threshold);
   return result;
 }