fix cpp header error (#371)

* 1. Use the macro USE_PARROTS to control header includes. 2. Add clang-format (Google style) to pre-commit. * Use MMCV_-prefixed macros.

fix cpp header error (#371)
* 1. Use the macro USE_PARROTS to control header includes. 2. Add clang-format (Google style) to pre-commit. * Use MMCV_-prefixed macros.
d9549fba · zhuyuanhao · GitHub · 2c6fc5fd · d9549fba · d9549fba
Unverified Commit d9549fba authored Jun 29, 2020 by zhuyuanhao Committed by GitHub Jun 29, 2020
20 changed files
--- a/mmcv/ops/csrc/pytorch/deform_conv.cpp
+++ b/mmcv/ops/csrc/pytorch/deform_conv.cpp
 #include "pytorch_cpp_helper.hpp"
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 void DeformConvForwardCUDAKernelLauncher(Tensor input, Tensor weight,
                                         Tensor offset, Tensor output,
                                         Tensor columns, Tensor ones, int kW,
@@ -62,7 +62,7 @@ void deform_conv_forward(Tensor input, Tensor weight, Tensor offset,
                         int dilationW, int dilationH, int group,
                         int deformable_group, int im2col_step) {
  if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(offset);
    CHECK_CUDA_INPUT(weight);
@@ -88,7 +88,7 @@ void deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput,
                                int dilationW, int dilationH, int group,
                                int deformable_group, int im2col_step) {
  if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(offset);
    CHECK_CUDA_INPUT(gradOutput);
@@ -117,7 +117,7 @@ void deform_conv_backward_parameters(Tensor input, Tensor offset,
                                     int deformable_group, float scale,
                                     int im2col_step) {
  if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(offset);
    CHECK_CUDA_INPUT(gradOutput);

--- a/mmcv/ops/csrc/pytorch/deform_roi_pool.cpp
+++ b/mmcv/ops/csrc/pytorch/deform_roi_pool.cpp
 #include "pytorch_cpp_helper.hpp"
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 void DeformRoIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
                                            Tensor offset, Tensor output,
                                            int pooled_height, int pooled_width,
@@ -38,7 +38,7 @@ void deform_roi_pool_forward(Tensor input, Tensor rois, Tensor offset,
                             float spatial_scale, int sampling_ratio,
                             float gamma) {
  if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(rois);
    CHECK_CUDA_INPUT(offset);
@@ -61,7 +61,7 @@ void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,
                              int pooled_width, float spatial_scale,
                              int sampling_ratio, float gamma) {
  if (grad_output.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(grad_output);
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(rois);

--- a/mmcv/ops/csrc/pytorch/focal_loss.cpp
+++ b/mmcv/ops/csrc/pytorch/focal_loss.cpp
 #include "pytorch_cpp_helper.hpp"
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 void SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
                                               Tensor weight, Tensor output,
                                               const float gamma,
@@ -54,7 +54,7 @@ void softmax_focal_loss_backward_cuda(Tensor input, Tensor target,
 void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
                                Tensor output, float gamma, float alpha) {
  if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(target);
    CHECK_CUDA_INPUT(weight);
@@ -73,7 +73,7 @@ void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
 void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
                                 Tensor grad_input, float gamma, float alpha) {
  if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(target);
    CHECK_CUDA_INPUT(weight);
@@ -92,7 +92,7 @@ void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
 void softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
                                Tensor output, float gamma, float alpha) {
  if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(target);
    CHECK_CUDA_INPUT(weight);
@@ -112,7 +112,7 @@ void softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
                                 Tensor buff, Tensor grad_input, float gamma,
                                 float alpha) {
  if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(target);
    CHECK_CUDA_INPUT(weight);

--- a/mmcv/ops/csrc/pytorch/focal_loss_cuda.cu
+++ b/mmcv/ops/csrc/pytorch/focal_loss_cuda.cu
 #include "pytorch_cuda_helper.hpp"
-#include "sigmoid_focal_loss_kernel.cuh"
+#include "sigmoid_focal_loss_cuda_kernel.cuh"
-#include "softmax_focal_loss_kernel.cuh"
+#include "softmax_focal_loss_cuda_kernel.cuh"
 void SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
                                               Tensor weight, Tensor output,

--- a/mmcv/ops/csrc/pytorch/info.cpp
+++ b/mmcv/ops/csrc/pytorch/info.cpp
@@ -2,13 +2,13 @@
 // https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/vision.cpp
 #include "pytorch_cpp_helper.hpp"
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 #include <cuda_runtime_api.h>
 int get_cudart_version() { return CUDART_VERSION; }
 #endif
 std::string get_compiling_cuda_version() {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
  std::ostringstream oss;
  // copied from
  // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231

--- a/mmcv/ops/csrc/pytorch/masked_conv2d.cpp
+++ b/mmcv/ops/csrc/pytorch/masked_conv2d.cpp
 #include "pytorch_cpp_helper.hpp"
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 void MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data,
                                           const Tensor mask_h_idx,
                                           const Tensor mask_w_idx,
@@ -39,7 +39,7 @@ void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,
                           const int kernel_h, const int kernel_w,
                           const int pad_h, const int pad_w) {
  if (im.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(im);
    CHECK_CUDA_INPUT(mask_h_idx);
    CHECK_CUDA_INPUT(mask_w_idx);
@@ -58,7 +58,7 @@ void masked_col2im_forward(const Tensor col, const Tensor mask_h_idx,
                           const Tensor mask_w_idx, Tensor im, int height,
                           int width, int channels) {
  if (col.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(col);
    CHECK_CUDA_INPUT(mask_h_idx);
    CHECK_CUDA_INPUT(mask_w_idx);

--- a/mmcv/ops/csrc/pytorch/modulated_deform_conv.cpp
+++ b/mmcv/ops/csrc/pytorch/modulated_deform_conv.cpp
 #include "pytorch_cpp_helper.hpp"
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 void ModulatedDeformConvForwardCUDAKernelLauncher(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
@@ -50,7 +50,7 @@ void modulated_deform_conv_forward(
    const int dilation_h, const int dilation_w, const int group,
    const int deformable_group, const bool with_bias) {
  if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(weight);
    CHECK_CUDA_INPUT(bias);
@@ -80,7 +80,7 @@ void modulated_deform_conv_backward(
    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
    const bool with_bias) {
  if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(weight);
    CHECK_CUDA_INPUT(bias);

--- a/mmcv/ops/csrc/pytorch/nms.cpp
+++ b/mmcv/ops/csrc/pytorch/nms.cpp
 #include "pytorch_cpp_helper.hpp"
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 Tensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
                             int offset);
@@ -62,7 +62,7 @@ Tensor nms_cpu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
 Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
  if (boxes.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(boxes);
    CHECK_CUDA_INPUT(scores);
    return nms_cuda(boxes, scores, iou_threshold, offset);

--- a/mmcv/ops/csrc/pytorch/nms_cuda.cu
+++ b/mmcv/ops/csrc/pytorch/nms_cuda.cu
-#include "nms_kernel.cuh"
+#include "nms_cuda_kernel.cuh"
 #include "pytorch_cuda_helper.hpp"
 Tensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,

--- a/mmcv/ops/csrc/pytorch/psamask.cpp
+++ b/mmcv/ops/csrc/pytorch/psamask.cpp
@@ -182,7 +182,7 @@ void psamask_backward_cpu(const int psa_type, const Tensor grad_output,
                                grad_input);
 }
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 void PSAMaskForwardCUDAKernelLauncher(const int psa_type, const Tensor input,
                                      Tensor output, const int num_,
                                      const int h_feature, const int w_feature,
@@ -221,7 +221,7 @@ void psamask_forward(const Tensor input, Tensor output, const int psa_type,
                     const int h_mask, const int w_mask, const int half_h_mask,
                     const int half_w_mask) {
  if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(output);
    psamask_forward_cuda(psa_type, input, output, num_, h_feature, w_feature,
@@ -240,7 +240,7 @@ void psamask_backward(Tensor grad_output, const Tensor grad_input,
                      const int w_feature, const int h_mask, const int w_mask,
                      const int half_h_mask, const int half_w_mask) {
  if (grad_input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(grad_input);
    CHECK_CUDA_INPUT(grad_output);
    psamask_backward_cuda(psa_type, grad_output, grad_input, num_, h_feature,

--- a/mmcv/ops/csrc/pytorch/roi_align.cpp
+++ b/mmcv/ops/csrc/pytorch/roi_align.cpp
 #include "pytorch_cpp_helper.hpp"
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                       Tensor argmax_y, Tensor argmax_x,
                                       int aligned_height, int aligned_width,
@@ -40,7 +40,7 @@ void roi_align_forward(Tensor input, Tensor rois, Tensor output,
                       int aligned_width, float spatial_scale,
                       int sampling_ratio, int pool_mode, bool aligned) {
  if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(rois);
    CHECK_CUDA_INPUT(output);
@@ -63,7 +63,7 @@ void roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y,
                        int aligned_width, float spatial_scale,
                        int sampling_ratio, int pool_mode, bool aligned) {
  if (grad_output.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(grad_output);
    CHECK_CUDA_INPUT(rois);
    CHECK_CUDA_INPUT(argmax_y);

--- a/mmcv/ops/csrc/pytorch/roi_align_cuda.cu
+++ b/mmcv/ops/csrc/pytorch/roi_align_cuda.cu
 #include "pytorch_cuda_helper.hpp"
-#include "roi_align_kernel.cuh"
+#include "roi_align_cuda_kernel.cuh"
 void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                       Tensor argmax_y, Tensor argmax_x,

--- a/mmcv/ops/csrc/pytorch/roi_pool.cpp
+++ b/mmcv/ops/csrc/pytorch/roi_pool.cpp
 #include "pytorch_cpp_helper.hpp"
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 void ROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                      Tensor argmax, int pooled_height,
                                      int pooled_width, float spatial_scale);
@@ -29,7 +29,7 @@ void roi_pool_forward(Tensor input, Tensor rois, Tensor output, Tensor argmax,
                      int pooled_height, int pooled_width,
                      float spatial_scale) {
  if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(rois);
    CHECK_CUDA_INPUT(output);
@@ -49,7 +49,7 @@ void roi_pool_backward(Tensor grad_output, Tensor rois, Tensor argmax,
                       Tensor grad_input, int pooled_height, int pooled_width,
                       float spatial_scale) {
  if (grad_output.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(grad_output);
    CHECK_CUDA_INPUT(rois);
    CHECK_CUDA_INPUT(argmax);

--- a/mmcv/ops/csrc/pytorch/roi_pool_cuda.cu
+++ b/mmcv/ops/csrc/pytorch/roi_pool_cuda.cu
 #include "pytorch_cuda_helper.hpp"
-#include "roi_pool_kernel.cuh"
+#include "roi_pool_cuda_kernel.cuh"
 void ROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                      Tensor argmax, int pooled_height,

--- a/mmcv/ops/csrc/pytorch/sync_bn.cpp
+++ b/mmcv/ops/csrc/pytorch/sync_bn.cpp
 #include "pytorch_cpp_helper.hpp"
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 void SyncBNForwardMeanCUDAKernelLauncher(const Tensor input, Tensor mean);
 void SyncBNForwardVarCUDAKernelLauncher(const Tensor input, const Tensor mean,
@@ -61,7 +61,7 @@ void sync_bn_backward_data_cuda(const Tensor grad_output, const Tensor weight,
 void sync_bn_forward_mean(const Tensor input, Tensor mean) {
  if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(mean);
    sync_bn_forward_mean_cuda(input, mean);
@@ -75,7 +75,7 @@ void sync_bn_forward_mean(const Tensor input, Tensor mean) {
 void sync_bn_forward_var(const Tensor input, const Tensor mean, Tensor var) {
  if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(mean);
    CHECK_CUDA_INPUT(var);
@@ -95,7 +95,7 @@ void sync_bn_forward_output(const Tensor input, const Tensor mean,
                            Tensor output, float eps, float momentum,
                            int group_size) {
  if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(mean);
    CHECK_CUDA_INPUT(var);
@@ -120,7 +120,7 @@ void sync_bn_forward_output(const Tensor input, const Tensor mean,
 void sync_bn_backward_param(const Tensor grad_output, const Tensor norm,
                            Tensor grad_weight, Tensor grad_bias) {
  if (grad_output.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(grad_output);
    CHECK_CUDA_INPUT(norm);
    CHECK_CUDA_INPUT(grad_weight);
@@ -139,7 +139,7 @@ void sync_bn_backward_data(const Tensor grad_output, const Tensor weight,
                           const Tensor norm, const Tensor std,
                           Tensor grad_input) {
  if (grad_output.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(grad_output);
    CHECK_CUDA_INPUT(weight);
    CHECK_CUDA_INPUT(grad_weight);

--- a/mmcv/ops/csrc/roi_align_kernel.cuh
+++ b/mmcv/ops/csrc/roi_align_kernel.cuh
-#ifndef ROI_ALIGN_KERNEL_CUH
+#ifndef ROI_ALIGN_CUDA_KERNEL_CUH
-#define ROI_ALIGN_KERNEL_CUH
+#define ROI_ALIGN_CUDA_KERNEL_CUH
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
 /*** Forward ***/
 template <typename T>
@@ -196,4 +202,4 @@ __global__ void roi_align_backward_cuda_kernel(
  }
 }
-#endif  // ROI_ALIGN_KERNEL_CUH
+#endif  // ROI_ALIGN_CUDA_KERNEL_CUH
--- a/mmcv/ops/csrc/roi_pool_kernel.cuh
+++ b/mmcv/ops/csrc/roi_pool_kernel.cuh
-#ifndef ROI_POOL_KERNEL_CUH
+#ifndef ROI_POOL_CUDA_KERNEL_CUH
-#define ROI_POOL_KERNEL_CUH
+#define ROI_POOL_CUDA_KERNEL_CUH
-#include <cuda.h>
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
 template <typename T>
 __global__ void roi_pool_forward_cuda_kernel(
@@ -85,4 +89,4 @@ __global__ void roi_pool_backward_cuda_kernel(
  }
 }
-#endif
+#endif  // ROI_POOL_CUDA_KERNEL_CUH
--- a/mmcv/ops/csrc/sigmoid_focal_loss_kernel.cuh
+++ b/mmcv/ops/csrc/sigmoid_focal_loss_kernel.cuh
-#ifndef SIGMOID_FOCAL_LOSS_KERNEL_CUH
+#ifndef SIGMOID_FOCAL_LOSS_CUDA_KERNEL_CUH
-#define SIGMOID_FOCAL_LOSS_KERNEL_CUH
+#define SIGMOID_FOCAL_LOSS_CUDA_KERNEL_CUH
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
 template <typename T>
 __global__ void sigmoid_focal_loss_forward_cuda_kernel(
@@ -60,4 +66,5 @@ __global__ void sigmoid_focal_loss_backward_cuda_kernel(
    }
  }
 }
-#endif
+#endif  // SIGMOID_FOCAL_LOSS_CUDA_KERNEL_CUH
--- a/mmcv/ops/csrc/softmax_focal_loss_kernel.cuh
+++ b/mmcv/ops/csrc/softmax_focal_loss_kernel.cuh
-#ifndef SOFTMAX_FOCAL_LOSS_KERNEL_CUH
+#ifndef SOFTMAX_FOCAL_LOSS_CUDA_KERNEL_CUH
-#define SOFTMAX_FOCAL_LOSS_KERNEL_CUH
+#define SOFTMAX_FOCAL_LOSS_CUDA_KERNEL_CUH
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
 template <typename T>
 __global__ void softmax_focal_loss_forward_cuda_kernel(
@@ -61,4 +67,5 @@ __global__ void softmax_focal_loss_backward_cuda2_kernel(
    }
  }
 }
-#endif
+#endif  // SOFTMAX_FOCAL_LOSS_CUDA_KERNEL_CUH
--- a/mmcv/ops/csrc/softnms_kernel.cuh
+++ b/mmcv/ops/csrc/softnms_kernel.cuh
-#ifndef SOFTNMS_KERNEL_CUH
-#define SOFTNMS_KERNEL_CUH
-#include <cuda.h>
-#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
-int const threadsPerBlock = sizeof(unsigned long long int) * 8;
-template <typename scalar_t>
-__device__ inline scalar_t devIoU(scalar_t const *const a,
-                                  scalar_t const *const b) {
-  scalar_t left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]);
-  scalar_t top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]);
-  scalar_t width = fmaxf(right - left + 1, 0.f),
-           height = fmaxf(bottom - top + 1, 0.f);
-  scalar_t interS = width * height;
-  scalar_t Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
-  scalar_t Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
-  return interS / (Sa + Sb - interS);
-}
-template <typename scalar_t>
-__global__ void softnms_max_kernel(const int n_boxes,
-                                   const scalar_t overlap_thresh,
-                                   const scalar_t *dev_boxes, int *order,
-                                   float *max_value, int *max_index) {
-  __shared__ float maximum[threadsPerBlock];
-  __shared__ int max_id[threadsPerBlock];
-  unsigned int tid = threadIdx.x;
-  unsigned int idx = blockIdx.x * threadsPerBlock + threadIdx.x;
-  if (idx >= n_boxes) {
-    return;
-  }
-  const int block_size = fminf(n_boxes + tid - idx, threadsPerBlock);
-  int *l_order = order + (idx - tid);
-  if (l_order[tid] == 0 && dev_boxes[idx * 5 + 4] >= overlap_thresh) {
-    maximum[tid] = dev_boxes[idx * 5 + 4];
-  } else {
-    maximum[tid] = -1.0;
-  }
-  max_id[tid] = tid;
-  __syncthreads();
-  if (block_size >= 1024 && tid < 512) {
-    if (maximum[tid] < maximum[tid + 512]) {
-      maximum[tid] = maximum[tid + 512];
-      max_id[tid] = max_id[tid + 512];
-    }
-  }
-  if (block_size >= 512 && tid < 256) {
-    if (maximum[tid] < maximum[tid + 256]) {
-      maximum[tid] = maximum[tid + 256];
-      max_id[tid] = max_id[tid + 256];
-    }
-  }
-  if (block_size >= 256 && tid < 128) {
-    if (maximum[tid] < maximum[tid + 128]) {
-      maximum[tid] = maximum[tid + 128];
-      max_id[tid] = max_id[tid + 128];
-    }
-  }
-  if (block_size >= 128 && tid < 64) {
-    if (maximum[tid] < maximum[tid + 64]) {
-      maximum[tid] = maximum[tid + 64];
-      max_id[tid] = max_id[tid + 64];
-    }
-  }
-  if (tid < 32) {
-    volatile float *vmaximum = maximum;
-    volatile int *vmax_id = max_id;
-    if (block_size >= 64 && vmaximum[tid] < vmaximum[tid + 32]) {
-      vmaximum[tid] = vmaximum[tid + 32];
-      vmax_id[tid] = vmax_id[tid + 32];
-    }
-    if (block_size >= 32 && tid < 16 && vmaximum[tid] < vmaximum[tid + 16]) {
-      vmaximum[tid] = vmaximum[tid + 16];
-      vmax_id[tid] = vmax_id[tid + 16];
-    }
-    if (block_size >= 16 && tid < 8 && vmaximum[tid] < vmaximum[tid + 8]) {
-      vmaximum[tid] = vmaximum[tid + 8];
-      vmax_id[tid] = vmax_id[tid + 8];
-    }
-    if (block_size >= 8 && tid < 4 && vmaximum[tid] < vmaximum[tid + 4]) {
-      vmaximum[tid] = vmaximum[tid + 4];
-      vmax_id[tid] = vmax_id[tid + 4];
-    }
-    if (block_size >= 4 && tid < 2 && vmaximum[tid] < vmaximum[tid + 2]) {
-      vmaximum[tid] = vmaximum[tid + 2];
-      vmax_id[tid] = vmax_id[tid + 2];
-    }
-    if (block_size >= 2 && tid < 1 && vmaximum[tid] < vmaximum[tid + 1]) {
-      vmaximum[tid] = vmaximum[tid + 1];
-      vmax_id[tid] = vmax_id[tid + 1];
-    }
-  }
-  if (tid == 0) {
-    max_value[blockIdx.x] = maximum[0];
-    max_index[blockIdx.x] = max_id[0];
-  }
-}
-template <typename scalar_t>
-__global__ void softnms_update_kernel(const int n_boxes, const scalar_t sigma,
-                                      const scalar_t n_thresh,
-                                      const unsigned int method,
-                                      const scalar_t overlap_thresh,
-                                      scalar_t *dev_boxes, int *order,
-                                      unsigned long long *keep, int max_id) {
-  const int col_start = blockIdx.x;
-  const int col_size =
-      fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
-  const int cur_idx = threadsPerBlock * col_start + threadIdx.x;
-  const int tid = threadIdx.x;
-  if (cur_idx >= n_boxes) {
-    return;
-  }
-  __shared__ scalar_t cur_max_boxes[5];
-  cur_max_boxes[0] = dev_boxes[max_id * 5 + 0];
-  cur_max_boxes[1] = dev_boxes[max_id * 5 + 1];
-  cur_max_boxes[2] = dev_boxes[max_id * 5 + 2];
-  cur_max_boxes[3] = dev_boxes[max_id * 5 + 3];
-  cur_max_boxes[4] = dev_boxes[max_id * 5 + 4];
-  __syncthreads();
-  if (cur_idx != max_id && tid < col_size && order[cur_idx] == 0 &&
-      (!(keep[col_start] & (1ULL << tid)))) {
-    scalar_t block_boxes[5];
-    block_boxes[0] = dev_boxes[cur_idx * 5 + 0];
-    block_boxes[1] = dev_boxes[cur_idx * 5 + 1];
-    block_boxes[2] = dev_boxes[cur_idx * 5 + 2];
-    block_boxes[3] = dev_boxes[cur_idx * 5 + 3];
-    block_boxes[4] = dev_boxes[cur_idx * 5 + 4];
-    scalar_t ovr = devIoU(cur_max_boxes, block_boxes);
-    scalar_t weight = 1.0;
-    if (method == 1) {
-      if (ovr > n_thresh) {
-        weight = 1.0 - ovr;
-      }
-    } else if (method == 2) {
-      weight = exp(-(ovr * ovr) / sigma);
-    } else if (ovr >= n_thresh) {
-      weight = 0.0;
-    }
-    block_boxes[4] *= weight;
-    dev_boxes[cur_idx * 5 + 4] = block_boxes[4];
-    if (block_boxes[4] < overlap_thresh) {
-      keep[col_start] |= 1ULL << tid;
-    }
-  }
-}
-#endif