Unverified commit 6efefa27, authored by Kai Chen and committed by GitHub

Merge pull request #20 from open-mmlab/dev

Initial public release
parents 2cf13281 54b54d88
void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
int boxes_dim, float nms_overlap_thresh, int device_id, size_t base);
size_t nms_Malloc();
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------
import numpy as np
cimport numpy as np
assert sizeof(int) == sizeof(np.int32_t)
cdef extern from "gpu_nms.hpp":
void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int, size_t) nogil
size_t nms_Malloc() nogil
memory_pool = {}
def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, float thresh,
np.int32_t device_id=0):
cdef int boxes_num = dets.shape[0]
cdef int boxes_dim = dets.shape[1]
cdef int num_out
cdef size_t base
cdef np.ndarray[np.int32_t, ndim=1] \
keep = np.zeros(boxes_num, dtype=np.int32)
cdef np.ndarray[np.float32_t, ndim=1] \
scores = dets[:, 4]
cdef np.ndarray[np.int_t, ndim=1] \
order = scores.argsort()[::-1]
cdef np.ndarray[np.float32_t, ndim=2] \
sorted_dets = dets[order, :]
cdef float cthresh = thresh
if device_id not in memory_pool:
with nogil:
base = nms_Malloc()
memory_pool[device_id] = base
# print "malloc", base
base = memory_pool[device_id]
with nogil:
_nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, cthresh, device_id, base)
keep = keep[:num_out]
return list(order[keep])
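For orientation, a minimal usage sketch of the wrapper defined above (illustrative values; assumes the extension has been built as `gpu_nms` and a CUDA device is available). `dets` is an N x 5 float32 array with columns [x1, y1, x2, y2, score]; the returned list contains the kept indices into `dets`, highest score first:

import numpy as np
# from gpu_nms import gpu_nms  # hypothetical import path after an in-place build

dets = np.array([[10, 10, 50, 50, 0.9],
                 [12, 12, 52, 52, 0.8],        # IoU ~0.83 with the first box
                 [100, 100, 150, 150, 0.7]], dtype=np.float32)
keep = gpu_nms(dets, 0.5, device_id=0)         # -> [0, 2]; the overlapping box is dropped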
// ------------------------------------------------------------------
// Faster R-CNN
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License [see fast-rcnn/LICENSE for details]
// Written by Shaoqing Ren
// ------------------------------------------------------------------
#include <stdio.h>
#include <iostream>
#include <vector>
#include "gpu_nms.hpp"
#define CUDA_CHECK(condition) \
/* Code block avoids redefinition of cudaError_t error */ \
do { \
cudaError_t error = condition; \
if (error != cudaSuccess) { \
std::cout << cudaGetErrorString(error) << std::endl; \
} \
} while (0)
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define MULTIPLIER 16     // 64-bit mask words kept per thread / per column block
#define LONGLONG_SIZE 64  // bits in an unsigned long long
int const threadsPerBlock =
    sizeof(unsigned long long) * 8 *
    MULTIPLIER;  // 64 bits per unsigned long long, times MULTIPLIER words
__device__ inline float devIoU(float const* const a, float const* const b) {
float left = max(a[0], b[0]), right = min(a[2], b[2]);
float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
float width = max(right - left + 1, 0.f),
height = max(bottom - top + 1, 0.f);
float interS = width * height;
float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
return interS / (Sa + Sb - interS);
}
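// nms_kernel: each CUDA block compares one row block of up to threadsPerBlock
// boxes against one column block. For every row box, the per-column-block
// result is packed into MULTIPLIER 64-bit words (one bit per column box whose
// IoU exceeds the threshold) and written to dev_mask, which the host code
// below scans greedily to build the final keep list.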
__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
const float* dev_boxes,
unsigned long long* dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
const int row_size =
min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
const int col_size =
min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
__shared__ float block_boxes[threadsPerBlock * 5];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 5 + 0] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
block_boxes[threadIdx.x * 5 + 1] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
block_boxes[threadIdx.x * 5 + 2] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
block_boxes[threadIdx.x * 5 + 3] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
block_boxes[threadIdx.x * 5 + 4] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
}
__syncthreads();
unsigned long long ts[MULTIPLIER];
if (threadIdx.x < row_size) {
#pragma unroll
for (int i = 0; i < MULTIPLIER; ++i) {
ts[i] = 0;
}
const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
const float* cur_box = dev_boxes + cur_box_idx * 5;
int i = 0;
int start = 0;
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
ts[i / LONGLONG_SIZE] |= 1ULL << (i % LONGLONG_SIZE);
}
}
const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
#pragma unroll
for (int i = 0; i < MULTIPLIER; ++i) {
dev_mask[(cur_box_idx * col_blocks + col_start) * MULTIPLIER + i] =
ts[i];
}
}
}
void _set_device(int device_id) {
int current_device;
CUDA_CHECK(cudaGetDevice(&current_device));
if (current_device == device_id) {
return;
}
// The call to cudaSetDevice must come before any calls to Get, which
// may perform initialization using the GPU.
CUDA_CHECK(cudaSetDevice(device_id));
}
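// nms_Malloc reserves a fixed ~500 MB scratch buffer on the current device;
// the Cython wrapper caches the returned base address (per device id) in
// memory_pool so subsequent calls reuse it instead of re-allocating.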
const size_t MEMORY_SIZE = 500000000;
size_t nms_Malloc() {
float* boxes_dev = NULL;
CUDA_CHECK(cudaMalloc(&boxes_dev, MEMORY_SIZE));
return size_t(boxes_dev);
}
void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
int boxes_dim, float nms_overlap_thresh, int device_id, size_t base) {
_set_device(device_id);
float* boxes_dev = NULL;
unsigned long long* mask_dev = NULL;
const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
if (base > 0) {
size_t require_mem =
boxes_num * boxes_dim * sizeof(float) +
boxes_num * col_blocks * sizeof(unsigned long long) * MULTIPLIER;
if (require_mem >= MEMORY_SIZE) {
std::cout << "require_mem: " << require_mem << std::endl;
}
boxes_dev = (float*)(base);
mask_dev =
(unsigned long long*)(base +
512 * ((unsigned long long)(boxes_num *
boxes_dim *
sizeof(float) /
512) +
1));
} else {
CUDA_CHECK(
cudaMalloc(&boxes_dev, boxes_num * boxes_dim * sizeof(float)));
CUDA_CHECK(cudaMalloc(&mask_dev, MULTIPLIER * boxes_num * col_blocks *
sizeof(unsigned long long)));
}
CUDA_CHECK(cudaMemcpy(boxes_dev, boxes_host,
boxes_num * boxes_dim * sizeof(float),
cudaMemcpyHostToDevice));
dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
DIVUP(boxes_num, threadsPerBlock));
dim3 threads(threadsPerBlock);
nms_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes_dev,
mask_dev);
std::vector<unsigned long long> mask_host(boxes_num * col_blocks *
MULTIPLIER);
CUDA_CHECK(cudaMemcpy(
&mask_host[0], mask_dev,
sizeof(unsigned long long) * boxes_num * col_blocks * MULTIPLIER,
cudaMemcpyDeviceToHost));
std::vector<unsigned long long> remv(col_blocks * MULTIPLIER);
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks * MULTIPLIER);
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / threadsPerBlock;
int inblock = i % threadsPerBlock;
int offset = inblock / LONGLONG_SIZE;
int bit_pos = inblock % LONGLONG_SIZE;
if (!(remv[nblock * MULTIPLIER + offset] & (1ULL << bit_pos))) {
keep_out[num_to_keep++] = i;
unsigned long long* p = &mask_host[0] + i * col_blocks * MULTIPLIER;
for (int j = nblock * MULTIPLIER + offset;
j < col_blocks * MULTIPLIER; j++) {
remv[j] |= p[j];
}
}
}
*num_out = num_to_keep;
if (!base) {
CUDA_CHECK(cudaFree(boxes_dev));
CUDA_CHECK(cudaFree(mask_dev));
}
}
import numpy as np
import torch
from .gpu_nms import gpu_nms
from .cpu_nms import cpu_nms
from .cpu_soft_nms import cpu_soft_nms
def nms(dets, thresh, device_id=None):
    """Dispatch to either CPU or GPU NMS implementations."""
    is_tensor = isinstance(dets, torch.Tensor)
    if is_tensor:
        if dets.is_cuda:
            device_id = dets.get_device()
        dets_np = dets.detach().cpu().numpy()
    else:
        dets_np = dets
    assert isinstance(dets_np, np.ndarray)
    if dets_np.shape[0] == 0:
        inds = []
    else:
        inds = (gpu_nms(dets_np, thresh, device_id=device_id)
                if device_id is not None else cpu_nms(dets_np, thresh))
    if is_tensor:
        # keep the returned indices on the same device/type as the input tensor
        return dets.new_tensor(inds, dtype=torch.long)
    else:
        return np.array(inds, dtype=np.int64)
def soft_nms(dets, Nt=0.3, method=1, sigma=0.5, min_score=0):
if isinstance(dets, torch.Tensor):
_dets = dets.detach().cpu().numpy()
else:
_dets = dets.copy()
assert isinstance(_dets, np.ndarray)
new_dets, inds = cpu_soft_nms(
_dets, Nt=Nt, method=method, sigma=sigma, threshold=min_score)
if isinstance(dets, torch.Tensor):
return dets.new_tensor(
inds, dtype=torch.long), dets.new_tensor(new_dets)
else:
        return np.array(inds, dtype=np.int64), np.array(
            new_dets, dtype=np.float32)
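A minimal dispatch sketch for the two entry points defined above (illustrative values; the GPU path assumes a CUDA build of the extensions):

import numpy as np
import torch

dets = np.array([[10, 10, 50, 50, 0.9],
                 [12, 12, 52, 52, 0.8]], dtype=np.float32)

keep = nms(dets, 0.5)                               # CPU path, numpy indices
keep_gpu = nms(torch.from_numpy(dets).cuda(), 0.5)  # GPU path, LongTensor indices
inds, new_dets = soft_nms(dets, Nt=0.3, method=1)   # soft-NMS: indices and rescored boxes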
import os
from distutils.core import setup
from distutils.extension import Extension
import numpy as np
from Cython.Build import cythonize
from Cython.Distutils import build_ext
CUDA_ROOT = '/usr/local/cuda'
CUDA = {
"include": os.path.join(CUDA_ROOT, 'include'),
"lib": os.path.join(CUDA_ROOT, 'lib64'),
"nvcc": os.path.join(CUDA_ROOT, 'bin', "nvcc")
}
inc_dirs = [CUDA['include'], np.get_include()]
lib_dirs = [CUDA['lib']]
# extensions
ext_args = dict(
include_dirs=inc_dirs,
library_dirs=lib_dirs,
language='c++',
libraries=['cudart'],
extra_compile_args={
"cc": ['-Wno-unused-function', '-Wno-write-strings'],
"nvcc": [
'-arch=sm_52', '--ptxas-options=-v', '-c', '--compiler-options',
'-fPIC'
],
},
)
extensions = [
Extension('cpu_nms', ['cpu_nms.pyx'], **ext_args),
Extension('gpu_nms', ['gpu_nms.pyx', 'nms_kernel.cu'], **ext_args),
Extension('cpu_soft_nms', ['cpu_soft_nms.pyx'], **ext_args),
]
def customize_compiler_for_nvcc(self):
"""inject deep into distutils to customize how the dispatch
to cc/nvcc works.
If you subclass UnixCCompiler, it's not trivial to get your subclass
injected in, and still have the right customizations (i.e.
distutils.sysconfig.customize_compiler) run on it. So instead of going
the OO route, I have this. Note, it's kindof like a wierd functional
subclassing going on."""
    # tell the compiler it can process .cu source files
    self.src_extensions.append('.cu')
    # save references to the default compiler_so and _compile methods
default_compiler_so = self.compiler_so
super = self._compile
# now redefine the _compile method. This gets executed for each
# object but distutils doesn't have the ability to change compilers
# based on source extension: we add it.
def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
if os.path.splitext(src)[1] == '.cu':
# use the cuda for .cu files
self.set_executable('compiler_so', CUDA['nvcc'])
# use only a subset of the extra_postargs, which are 1-1 translated
# from the extra_compile_args in the Extension class
postargs = extra_postargs['nvcc']
else:
postargs = extra_postargs['cc']
super(obj, src, ext, cc_args, postargs, pp_opts)
# reset the default compiler_so, which we might have changed for cuda
self.compiler_so = default_compiler_so
# inject our redefined _compile method into the class
self._compile = _compile
# run the customize_compiler
class custom_build_ext(build_ext):
def build_extensions(self):
customize_compiler_for_nvcc(self.compiler)
build_ext.build_extensions(self)
setup(
name='nms',
cmdclass={'build_ext': custom_build_ext},
ext_modules=cythonize(extensions),
)
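The three extensions above are typically compiled in place with `python setup.py build_ext --inplace`; CUDA_ROOT and the `-arch=sm_52` flag may need to be adjusted to the local CUDA installation and target GPU architecture.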
from .functions.roi_align import roi_align
from .modules.roi_align import RoIAlign
__all__ = ['roi_align', 'RoIAlign']
from torch.autograd import Function, Variable
from .. import roi_align_cuda
class RoIAlignFunction(Function):
@staticmethod
def forward(ctx, features, rois, out_size, spatial_scale, sample_num=0):
if isinstance(out_size, int):
out_h = out_size
out_w = out_size
elif isinstance(out_size, tuple):
assert len(out_size) == 2
assert isinstance(out_size[0], int)
assert isinstance(out_size[1], int)
out_h, out_w = out_size
else:
raise TypeError(
'"out_size" must be an integer or tuple of integers')
ctx.spatial_scale = spatial_scale
ctx.sample_num = sample_num
ctx.save_for_backward(rois)
ctx.feature_size = features.size()
batch_size, num_channels, data_height, data_width = features.size()
num_rois = rois.size(0)
output = features.new_zeros(num_rois, num_channels, out_h, out_w)
if features.is_cuda:
roi_align_cuda.forward(features, rois, out_h, out_w, spatial_scale,
sample_num, output)
else:
raise NotImplementedError
return output
@staticmethod
def backward(ctx, grad_output):
feature_size = ctx.feature_size
spatial_scale = ctx.spatial_scale
sample_num = ctx.sample_num
rois = ctx.saved_tensors[0]
assert (feature_size is not None and grad_output.is_cuda)
batch_size, num_channels, data_height, data_width = feature_size
out_w = grad_output.size(3)
out_h = grad_output.size(2)
grad_input = grad_rois = None
if ctx.needs_input_grad[0]:
grad_input = Variable(
rois.new(batch_size, num_channels, data_height, data_width)
.zero_())
roi_align_cuda.backward(grad_output, rois, out_h, out_w,
spatial_scale, sample_num, grad_input)
return grad_input, grad_rois, None, None, None
roi_align = RoIAlignFunction.apply
import numpy as np
import torch
from torch.autograd import gradcheck
import os.path as osp
import sys
sys.path.append(osp.abspath(osp.join(__file__, '../../')))
from roi_align import RoIAlign # noqa: E402
feat_size = 15
spatial_scale = 1.0 / 8
img_size = feat_size / spatial_scale
num_imgs = 2
num_rois = 20
batch_ind = np.random.randint(num_imgs, size=(num_rois, 1))
rois = np.random.rand(num_rois, 4) * img_size * 0.5
rois[:, 2:] += img_size * 0.5
rois = np.hstack((batch_ind, rois))
feat = torch.randn(
num_imgs, 16, feat_size, feat_size, requires_grad=True, device='cuda:0')
rois = torch.from_numpy(rois).float().cuda()
inputs = (feat, rois)
print('Gradcheck for roi align...')
test = gradcheck(RoIAlign(3, spatial_scale), inputs, atol=1e-3, eps=1e-3)
print(test)
test = gradcheck(RoIAlign(3, spatial_scale, 2), inputs, atol=1e-3, eps=1e-3)
print(test)
from torch.nn.modules.module import Module
from ..functions.roi_align import RoIAlignFunction
class RoIAlign(Module):
def __init__(self, out_size, spatial_scale, sample_num=0):
super(RoIAlign, self).__init__()
self.out_size = out_size
self.spatial_scale = float(spatial_scale)
self.sample_num = int(sample_num)
def forward(self, features, rois):
return RoIAlignFunction.apply(features, rois, self.out_size,
self.spatial_scale, self.sample_num)
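A minimal usage sketch of the RoIAlign module (illustrative shapes; assumes the roi_align_cuda extension is built and a GPU is available). Each RoI is [batch_idx, x1, y1, x2, y2] in input-image coordinates, which the kernel scales into feature-map coordinates with spatial_scale:

import torch
# from mmdet.ops import RoIAlign  # hypothetical import path; adjust to the local layout

feats = torch.randn(2, 256, 64, 64, device='cuda')        # stride-8 feature map
rois = torch.tensor([[0, 32., 32., 480., 480.],
                     [1, 64., 96., 256., 320.]], device='cuda')
align = RoIAlign(out_size=7, spatial_scale=1.0 / 8, sample_num=2)
out = align(feats, rois)                                   # -> shape (2, 256, 7, 7)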
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
setup(
name='roi_align_cuda',
ext_modules=[
CUDAExtension('roi_align_cuda', [
'src/roi_align_cuda.cpp',
'src/roi_align_kernel.cu',
]),
],
cmdclass={'build_ext': BuildExtension})
#include <torch/torch.h>
#include <cmath>
#include <vector>
int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois,
const float spatial_scale, const int sample_num,
const int channels, const int height,
const int width, const int num_rois,
const int pooled_height, const int pooled_width,
at::Tensor output);
int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
const float spatial_scale, const int sample_num,
const int channels, const int height,
const int width, const int num_rois,
const int pooled_height, const int pooled_width,
at::Tensor bottom_grad);
#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDA tensor ")
#define CHECK_CONTIGUOUS(x) \
AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
int roi_align_forward_cuda(at::Tensor features, at::Tensor rois,
int pooled_height, int pooled_width,
float spatial_scale, int sample_num,
at::Tensor output) {
CHECK_INPUT(features);
CHECK_INPUT(rois);
CHECK_INPUT(output);
// Number of ROIs
int num_rois = rois.size(0);
int size_rois = rois.size(1);
if (size_rois != 5) {
printf("wrong roi size\n");
return 0;
}
int num_channels = features.size(1);
int data_height = features.size(2);
int data_width = features.size(3);
ROIAlignForwardLaucher(features, rois, spatial_scale, sample_num,
num_channels, data_height, data_width, num_rois,
pooled_height, pooled_width, output);
return 1;
}
int roi_align_backward_cuda(at::Tensor top_grad, at::Tensor rois,
int pooled_height, int pooled_width,
float spatial_scale, int sample_num,
at::Tensor bottom_grad) {
CHECK_INPUT(top_grad);
CHECK_INPUT(rois);
CHECK_INPUT(bottom_grad);
// Number of ROIs
int num_rois = rois.size(0);
int size_rois = rois.size(1);
if (size_rois != 5) {
printf("wrong roi size\n");
return 0;
}
int num_channels = bottom_grad.size(1);
int data_height = bottom_grad.size(2);
int data_width = bottom_grad.size(3);
ROIAlignBackwardLaucher(top_grad, rois, spatial_scale, sample_num,
num_channels, data_height, data_width, num_rois,
pooled_height, pooled_width, bottom_grad);
return 1;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &roi_align_forward_cuda, "Roi_Align forward (CUDA)");
m.def("backward", &roi_align_backward_cuda, "Roi_Align backward (CUDA)");
}
#include <ATen/ATen.h>
#include <THC/THCAtomics.cuh>
using namespace at;  // temporary fix for pytorch<=0.4.1 (see #9848)
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
i += blockDim.x * gridDim.x)
#define THREADS_PER_BLOCK 1024
inline int GET_BLOCKS(const int N) {
int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
int max_block_num = 65000;
return min(optimal_block_num, max_block_num);
}
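// bilinear_interpolate: samples bottom_data at the fractional location (y, x).
// Points more than one pixel outside the feature map return 0; points within
// the border are clamped to the valid range before interpolation.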
template <typename scalar_t>
__device__ scalar_t bilinear_interpolate(const scalar_t *bottom_data,
const int height, const int width,
scalar_t y, scalar_t x) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
return 0;
}
if (y <= 0) y = 0;
if (x <= 0) x = 0;
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (scalar_t)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (scalar_t)x_low;
} else {
x_high = x_low + 1;
}
scalar_t ly = y - y_low;
scalar_t lx = x - x_low;
scalar_t hy = 1. - ly;
scalar_t hx = 1. - lx;
// do bilinear interpolation
scalar_t lt = bottom_data[y_low * width + x_low];
scalar_t rt = bottom_data[y_low * width + x_high];
scalar_t lb = bottom_data[y_high * width + x_low];
scalar_t rb = bottom_data[y_high * width + x_high];
scalar_t w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
scalar_t val = (w1 * lt + w2 * rt + w3 * lb + w4 * rb);
return val;
}
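// ROIAlignForward: one thread per output element (n, c, ph, pw). Each RoI bin
// is sampled on a sample_num x sample_num grid (sized adaptively when
// sample_num == 0) and the bilinearly interpolated samples are averaged.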
template <typename scalar_t>
__global__ void ROIAlignForward(const int nthreads, const scalar_t *bottom_data,
const scalar_t *bottom_rois,
const scalar_t spatial_scale,
const int sample_num, const int channels,
const int height, const int width,
const int pooled_height, const int pooled_width,
scalar_t *top_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the aligned output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const scalar_t *offset_bottom_rois = bottom_rois + n * 5;
int roi_batch_ind = offset_bottom_rois[0];
scalar_t roi_start_w = offset_bottom_rois[1] * spatial_scale;
scalar_t roi_start_h = offset_bottom_rois[2] * spatial_scale;
scalar_t roi_end_w = (offset_bottom_rois[3] + 1) * spatial_scale;
scalar_t roi_end_h = (offset_bottom_rois[4] + 1) * spatial_scale;
// Force malformed ROIs to be 1x1
scalar_t roi_width = fmaxf((scalar_t)roi_end_w - roi_start_w, 0.);
scalar_t roi_height = fmaxf((scalar_t)roi_end_h - roi_start_h, 0.);
scalar_t bin_size_h = roi_height / pooled_height;
scalar_t bin_size_w = roi_width / pooled_width;
const scalar_t *offset_bottom_data =
bottom_data + (roi_batch_ind * channels + c) * height * width;
int sample_num_h = (sample_num > 0)
? sample_num
: ceil(roi_height / pooled_height); // e.g., = 2
int sample_num_w =
(sample_num > 0) ? sample_num : ceil(roi_width / pooled_width);
scalar_t h = (scalar_t)(ph + 0.5) * bin_size_h + roi_start_h;
scalar_t w = (scalar_t)(pw + 0.5) * bin_size_w + roi_start_w;
int hstart = fminf(floor(h), height - 2);
int wstart = fminf(floor(w), width - 2);
scalar_t output_val = 0;
for (int iy = 0; iy < sample_num_h; iy++) {
const scalar_t y = roi_start_h + ph * bin_size_h +
(scalar_t)(iy + scalar_t(.5f)) * bin_size_h /
(scalar_t)(sample_num_h);
for (int ix = 0; ix < sample_num_w; ix++) {
const scalar_t x = roi_start_w + pw * bin_size_w +
(scalar_t)(ix + scalar_t(.5f)) * bin_size_w /
(scalar_t)(sample_num_w);
scalar_t val = bilinear_interpolate<scalar_t>(offset_bottom_data,
height, width, y, x);
output_val += val;
}
}
output_val /= (sample_num_h * sample_num_w);
top_data[index] = output_val;
}
}
int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois,
const float spatial_scale, const int sample_num,
const int channels, const int height,
const int width, const int num_rois,
const int pooled_height, const int pooled_width,
at::Tensor output) {
const int output_size = num_rois * pooled_height * pooled_width * channels;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.type(), "ROIAlignLaucherForward", ([&] {
const scalar_t *bottom_data = features.data<scalar_t>();
const scalar_t *rois_data = rois.data<scalar_t>();
scalar_t *top_data = output.data<scalar_t>();
ROIAlignForward<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
output_size, bottom_data, rois_data, scalar_t(spatial_scale),
sample_num, channels, height, width, pooled_height,
pooled_width, top_data);
}));
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
return 1;
}
template <typename scalar_t>
__device__ void bilinear_interpolate_gradient(const int height, const int width,
scalar_t y, scalar_t x,
scalar_t &w1, scalar_t &w2,
scalar_t &w3, scalar_t &w4,
int &x_low, int &x_high,
int &y_low, int &y_high) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
w1 = w2 = w3 = w4 = 0.;
x_low = x_high = y_low = y_high = -1;
return;
}
if (y <= 0) y = 0;
if (x <= 0) x = 0;
y_low = (int)y;
x_low = (int)x;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (scalar_t)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (scalar_t)x_low;
} else {
x_high = x_low + 1;
}
scalar_t ly = y - y_low;
scalar_t lx = x - x_low;
scalar_t hy = 1. - ly;
scalar_t hx = 1. - lx;
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
return;
}
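// ROIAlignBackward: mirrors the forward sampling pattern and scatters each
// output gradient to the four neighbouring input cells of every sampling
// point via atomicAdd, weighted by the bilinear interpolation coefficients.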
template <typename scalar_t>
__global__ void ROIAlignBackward(
const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois,
const scalar_t spatial_scale, const int sample_num, const int channels,
const int height, const int width, const int pooled_height,
const int pooled_width, scalar_t *bottom_diff) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the aligned output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const scalar_t *offset_bottom_rois = bottom_rois + n * 5;
int roi_batch_ind = offset_bottom_rois[0];
scalar_t roi_start_w = offset_bottom_rois[1] * spatial_scale;
scalar_t roi_start_h = offset_bottom_rois[2] * spatial_scale;
scalar_t roi_end_w = (offset_bottom_rois[3] + 1) * spatial_scale;
scalar_t roi_end_h = (offset_bottom_rois[4] + 1) * spatial_scale;
// Force malformed ROIs to be 1x1
scalar_t roi_width = fmaxf((scalar_t)roi_end_w - roi_start_w, 0.);
scalar_t roi_height = fmaxf((scalar_t)roi_end_h - roi_start_h, 0.);
scalar_t bin_size_h = roi_height / pooled_height;
scalar_t bin_size_w = roi_width / pooled_width;
scalar_t *offset_bottom_diff =
bottom_diff + (roi_batch_ind * channels + c) * height * width;
int offset_top = (n * channels + c) * pooled_height * pooled_width +
ph * pooled_width + pw;
scalar_t offset_top_diff = top_diff[offset_top];
int sample_num_h = (sample_num > 0)
? sample_num
: ceil(roi_height / pooled_height); // e.g., = 2
int sample_num_w =
(sample_num > 0) ? sample_num : ceil(roi_width / pooled_width);
const scalar_t count = (scalar_t)(sample_num_h * sample_num_w);
scalar_t h = (scalar_t)(ph + 0.5) * bin_size_h + roi_start_h;
scalar_t w = (scalar_t)(pw + 0.5) * bin_size_w + roi_start_w;
int hstart = fminf(floor(h), height - 2);
int wstart = fminf(floor(w), width - 2);
for (int iy = 0; iy < sample_num_h; iy++) {
const scalar_t y =
roi_start_h + ph * bin_size_h +
(scalar_t)(iy + .5f) * bin_size_h / (scalar_t)(sample_num_h);
for (int ix = 0; ix < sample_num_w; ix++) {
const scalar_t x =
roi_start_w + pw * bin_size_w +
(scalar_t)(ix + .5f) * bin_size_w / (scalar_t)(sample_num_w);
scalar_t w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient<scalar_t>(
height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high);
scalar_t g1 = offset_top_diff * w1 / count;
scalar_t g2 = offset_top_diff * w2 / count;
scalar_t g3 = offset_top_diff * w3 / count;
scalar_t g4 = offset_top_diff * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
atomicAdd(offset_bottom_diff + y_low * width + x_low, g1);
atomicAdd(offset_bottom_diff + y_low * width + x_high, g2);
atomicAdd(offset_bottom_diff + y_high * width + x_low, g3);
atomicAdd(offset_bottom_diff + y_high * width + x_high, g4);
}
}
}
}
}
int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
const float spatial_scale, const int sample_num,
const int channels, const int height,
const int width, const int num_rois,
const int pooled_height, const int pooled_width,
at::Tensor bottom_grad) {
const int output_size = num_rois * pooled_height * pooled_width * channels;
// TODO: use AT_DISPATCH_FLOATING_TYPES_AND_HALF when atomicAdd is resolved
AT_DISPATCH_FLOATING_TYPES(
top_grad.type(), "ROIAlignLaucherBackward", ([&] {
const scalar_t *top_diff = top_grad.data<scalar_t>();
const scalar_t *rois_data = rois.data<scalar_t>();
scalar_t *bottom_diff = bottom_grad.data<scalar_t>();
if (sizeof(scalar_t) == sizeof(double)) {
fprintf(stderr, "double is not supported\n");
exit(-1);
}
ROIAlignBackward<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
output_size, top_diff, rois_data, spatial_scale, sample_num,
channels, height, width, pooled_height, pooled_width,
bottom_diff);
}));
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
return 1;
}
from .functions.roi_pool import roi_pool
from .modules.roi_pool import RoIPool
__all__ = ['roi_pool', 'RoIPool']
import torch
from torch.autograd import Function
from .. import roi_pool_cuda
class RoIPoolFunction(Function):
@staticmethod
def forward(ctx, features, rois, out_size, spatial_scale):
if isinstance(out_size, int):
out_h = out_size
out_w = out_size
elif isinstance(out_size, tuple):
assert len(out_size) == 2
assert isinstance(out_size[0], int)
assert isinstance(out_size[1], int)
out_h, out_w = out_size
else:
raise TypeError(
'"out_size" must be an integer or tuple of integers')
assert features.is_cuda
ctx.save_for_backward(rois)
num_channels = features.size(1)
num_rois = rois.size(0)
out_size = (num_rois, num_channels, out_h, out_w)
output = features.new_zeros(*out_size)
argmax = features.new_zeros(*out_size, dtype=torch.int)
roi_pool_cuda.forward(features, rois, out_h, out_w, spatial_scale,
output, argmax)
ctx.spatial_scale = spatial_scale
ctx.feature_size = features.size()
ctx.argmax = argmax
return output
@staticmethod
def backward(ctx, grad_output):
assert grad_output.is_cuda
spatial_scale = ctx.spatial_scale
feature_size = ctx.feature_size
argmax = ctx.argmax
rois = ctx.saved_tensors[0]
assert feature_size is not None
grad_input = grad_rois = None
if ctx.needs_input_grad[0]:
grad_input = grad_output.new(feature_size).zero_()
roi_pool_cuda.backward(grad_output, rois, argmax, spatial_scale,
grad_input)
return grad_input, grad_rois, None, None
roi_pool = RoIPoolFunction.apply
import torch
from torch.autograd import gradcheck
import os.path as osp
import sys
sys.path.append(osp.abspath(osp.join(__file__, '../../')))
from roi_pool import RoIPool # noqa: E402
feat = torch.randn(4, 16, 15, 15, requires_grad=True).cuda()
rois = torch.Tensor([[0, 0, 0, 50, 50], [0, 10, 30, 43, 55],
[1, 67, 40, 110, 120]]).cuda()
inputs = (feat, rois)
print('Gradcheck for roi pooling...')
test = gradcheck(RoIPool(4, 1.0 / 8), inputs, eps=1e-5, atol=1e-3)
print(test)
from torch.nn.modules.module import Module
from ..functions.roi_pool import roi_pool
class RoIPool(Module):
def __init__(self, out_size, spatial_scale):
super(RoIPool, self).__init__()
self.out_size = out_size
self.spatial_scale = float(spatial_scale)
def forward(self, features, rois):
return roi_pool(features, rois, self.out_size, self.spatial_scale)
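For completeness, a usage sketch of RoIPool mirroring the RoIAlign example above (illustrative shapes; assumes the roi_pool_cuda extension is built and a GPU is available):

import torch

feats = torch.randn(2, 256, 64, 64, device='cuda')
rois = torch.tensor([[0, 32., 32., 480., 480.],
                     [1, 64., 96., 256., 320.]], device='cuda')
pool = RoIPool(out_size=7, spatial_scale=1.0 / 8)
out = pool(feats, rois)                                    # -> shape (2, 256, 7, 7)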