Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
SOLOv2-pytorch
Commits
108fc9e1
Commit
108fc9e1
authored
Sep 02, 2018
by
Kai Chen
Browse files
set up the codebase skeleton (WIP)
parent
6985ef31
Changes
92
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
738 additions
and
0 deletions
+738
-0
mmdet/ops/roi_align/src/roi_align_kernel.cu
mmdet/ops/roi_align/src/roi_align_kernel.cu
+319
-0
mmdet/ops/roi_pool/__init__.py
mmdet/ops/roi_pool/__init__.py
+2
-0
mmdet/ops/roi_pool/functions/__init__.py
mmdet/ops/roi_pool/functions/__init__.py
+0
-0
mmdet/ops/roi_pool/functions/roi_pool.py
mmdet/ops/roi_pool/functions/roi_pool.py
+56
-0
mmdet/ops/roi_pool/gradcheck.py
mmdet/ops/roi_pool/gradcheck.py
+15
-0
mmdet/ops/roi_pool/modules/__init__.py
mmdet/ops/roi_pool/modules/__init__.py
+0
-0
mmdet/ops/roi_pool/modules/roi_pool.py
mmdet/ops/roi_pool/modules/roi_pool.py
+14
-0
mmdet/ops/roi_pool/setup.py
mmdet/ops/roi_pool/setup.py
+12
-0
mmdet/ops/roi_pool/src/roi_pool_cuda.cpp
mmdet/ops/roi_pool/src/roi_pool_cuda.cpp
+86
-0
mmdet/ops/roi_pool/src/roi_pool_kernel.cu
mmdet/ops/roi_pool/src/roi_pool_kernel.cu
+193
-0
mmdet/version.py
mmdet/version.py
+1
-0
setup.py
setup.py
+40
-0
No files found.
mmdet/ops/roi_align/src/roi_align_kernel.cu
0 → 100644
View file @
108fc9e1
#include <ATen/ATen.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <vector>
// Grid-stride loop over a flat 1-D index space of `n` elements.
#define CUDA_1D_KERNEL_LOOP(i, n)                            \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
       i += blockDim.x * gridDim.x)

#define THREADS_PER_BLOCK 1024

// Number of 1-D grid blocks needed to cover `N` elements with
// THREADS_PER_BLOCK threads each, clamped to a conservative 65000
// blocks (kernels use a grid-stride loop, so clamping stays correct).
inline int GET_BLOCKS(const int N) {
  const int wanted = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
  const int cap = 65000;
  return wanted < cap ? wanted : cap;
}
// Bilinearly sample `bottom_data` (a height x width map, row-major) at the
// continuous coordinate (y, x).  Samples more than one pixel outside the
// map return 0; coordinates in [-1, 0] are clamped to the border.
// FIX: the `1.` literals were double, forcing a float->double->float round
// trip in the float instantiation; use scalar_t(1) instead.
template <typename scalar_t>
__device__ scalar_t bilinear_interpolate(const scalar_t *bottom_data,
                                         const int height, const int width,
                                         scalar_t y, scalar_t x) {
  // deal with cases that inverse elements are out of feature map boundary
  if (y < -1.0 || y > height || x < -1.0 || x > width) {
    return 0;
  }

  if (y <= 0) y = 0;
  if (x <= 0) x = 0;

  int y_low = (int)y;
  int x_low = (int)x;
  int y_high;
  int x_high;

  // clamp the high corner to the last row/column
  if (y_low >= height - 1) {
    y_high = y_low = height - 1;
    y = (scalar_t)y_low;
  } else {
    y_high = y_low + 1;
  }

  if (x_low >= width - 1) {
    x_high = x_low = width - 1;
    x = (scalar_t)x_low;
  } else {
    x_high = x_low + 1;
  }

  // l* = fractional part, h* = complement
  scalar_t ly = y - y_low;
  scalar_t lx = x - x_low;
  scalar_t hy = scalar_t(1) - ly;
  scalar_t hx = scalar_t(1) - lx;

  // do bilinear interpolation over the four neighbouring values
  scalar_t lt = bottom_data[y_low * width + x_low];
  scalar_t rt = bottom_data[y_low * width + x_high];
  scalar_t lb = bottom_data[y_high * width + x_low];
  scalar_t rb = bottom_data[y_high * width + x_high];
  scalar_t w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

  scalar_t val = (w1 * lt + w2 * rt + w3 * lb + w4 * rb);
  return val;
}
// ROIAlign forward kernel.  One thread per output element (n, c, ph, pw):
// the output is the mean of sample_num_h * sample_num_w bilinear samples
// inside the bin.  `bottom_rois` rows are (batch_idx, x1, y1, x2, y2) in
// input-image coordinates, mapped to the feature map via spatial_scale.
// FIX: removed dead locals h, w, hstart, wstart — they were computed but
// never read anywhere in the kernel.
template <typename scalar_t>
__global__ void ROIAlignForward(const int nthreads, const scalar_t *bottom_data,
                                const scalar_t *bottom_rois,
                                const scalar_t spatial_scale,
                                const int sample_num, const int channels,
                                const int height, const int width,
                                const int pooled_height, const int pooled_width,
                                scalar_t *top_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the aligned output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const scalar_t *offset_bottom_rois = bottom_rois + n * 5;
    int roi_batch_ind = offset_bottom_rois[0];
    scalar_t roi_start_w = offset_bottom_rois[1] * spatial_scale;
    scalar_t roi_start_h = offset_bottom_rois[2] * spatial_scale;
    scalar_t roi_end_w = (offset_bottom_rois[3] + 1) * spatial_scale;
    scalar_t roi_end_h = (offset_bottom_rois[4] + 1) * spatial_scale;

    // Force malformed ROIs to be 1x1
    scalar_t roi_width = fmaxf((scalar_t)roi_end_w - roi_start_w, 0.);
    scalar_t roi_height = fmaxf((scalar_t)roi_end_h - roi_start_h, 0.);

    scalar_t bin_size_h = roi_height / pooled_height;
    scalar_t bin_size_w = roi_width / pooled_width;

    const scalar_t *offset_bottom_data =
        bottom_data + (roi_batch_ind * channels + c) * height * width;

    // adaptive sampling density when sample_num <= 0 (e.g., = 2)
    int sample_num_h =
        (sample_num > 0) ? sample_num : ceil(roi_height / pooled_height);
    int sample_num_w =
        (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width);

    scalar_t output_val = 0;
    for (int iy = 0; iy < sample_num_h; iy++) {
      const scalar_t y = roi_start_h + ph * bin_size_h +
                         (scalar_t)(iy + scalar_t(.5f)) * bin_size_h /
                             (scalar_t)(sample_num_h);
      for (int ix = 0; ix < sample_num_w; ix++) {
        const scalar_t x = roi_start_w + pw * bin_size_w +
                           (scalar_t)(ix + scalar_t(.5f)) * bin_size_w /
                               (scalar_t)(sample_num_w);
        scalar_t val = bilinear_interpolate<scalar_t>(offset_bottom_data,
                                                      height, width, y, x);
        output_val += val;
      }
    }
    output_val /= (sample_num_h * sample_num_w);
    top_data[index] = output_val;
  }
}
// Host-side launcher for ROIAlignForward.  Dispatches on the floating
// dtype of `features`, launches one thread per output element, and
// aborts the process on any CUDA error reported after the launch.
int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois,
                           const float spatial_scale, const int sample_num,
                           const int channels, const int height,
                           const int width, const int num_rois,
                           const int pooled_height, const int pooled_width,
                           at::Tensor output) {
  const int output_size = num_rois * pooled_height * pooled_width * channels;

  AT_DISPATCH_FLOATING_TYPES(
      features.type(), "ROIAlignLaucherForward", ([&] {
        const scalar_t *bottom_data = features.data<scalar_t>();
        const scalar_t *rois_data = rois.data<scalar_t>();
        scalar_t *top_data = output.data<scalar_t>();

        ROIAlignForward<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
                output_size, bottom_data, rois_data, scalar_t(spatial_scale),
                sample_num, channels, height, width, pooled_height,
                pooled_width, top_data);
      }));

  cudaError_t err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
  return 1;
}
// Gradient counterpart of bilinear_interpolate: for a sample point (y, x)
// on a height x width map, write the four interpolation weights (w1..w4)
// and the corner indices (x_low/x_high/y_low/y_high) through the
// reference out-params.  Out-of-bounds samples get zero weights and
// indices of -1 so callers can skip them.
// FIX: the `1.` literals were double, forcing a float->double->float round
// trip in the float instantiation; use scalar_t(1) instead.
template <typename scalar_t>
__device__ void bilinear_interpolate_gradient(
    const int height, const int width, scalar_t y, scalar_t x, scalar_t &w1,
    scalar_t &w2, scalar_t &w3, scalar_t &w4, int &x_low, int &x_high,
    int &y_low, int &y_high) {
  // deal with cases that inverse elements are out of feature map boundary
  if (y < -1.0 || y > height || x < -1.0 || x > width) {
    w1 = w2 = w3 = w4 = 0.;
    x_low = x_high = y_low = y_high = -1;
    return;
  }

  if (y <= 0) y = 0;
  if (x <= 0) x = 0;

  y_low = (int)y;
  x_low = (int)x;

  // clamp the high corner to the last row/column
  if (y_low >= height - 1) {
    y_high = y_low = height - 1;
    y = (scalar_t)y_low;
  } else {
    y_high = y_low + 1;
  }

  if (x_low >= width - 1) {
    x_high = x_low = width - 1;
    x = (scalar_t)x_low;
  } else {
    x_high = x_low + 1;
  }

  // l* = fractional part, h* = complement
  scalar_t ly = y - y_low;
  scalar_t lx = x - x_low;
  scalar_t hy = scalar_t(1) - ly;
  scalar_t hx = scalar_t(1) - lx;

  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

  return;
}
// ROIAlign backward kernel.  One thread per output-gradient element
// (n, c, ph, pw): the gradient is split over the four bilinear corners of
// every sample point in the bin and scattered into bottom_diff with
// atomicAdd (multiple rois may overlap the same input pixel).
// FIX: removed dead locals h, w, hstart, wstart — they were computed but
// never read anywhere in the kernel.
template <typename scalar_t>
__global__ void ROIAlignBackward(
    const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois,
    const scalar_t spatial_scale, const int sample_num, const int channels,
    const int height, const int width, const int pooled_height,
    const int pooled_width, scalar_t *bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the aligned output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const scalar_t *offset_bottom_rois = bottom_rois + n * 5;
    int roi_batch_ind = offset_bottom_rois[0];
    scalar_t roi_start_w = offset_bottom_rois[1] * spatial_scale;
    scalar_t roi_start_h = offset_bottom_rois[2] * spatial_scale;
    scalar_t roi_end_w = (offset_bottom_rois[3] + 1) * spatial_scale;
    scalar_t roi_end_h = (offset_bottom_rois[4] + 1) * spatial_scale;

    // Force malformed ROIs to be 1x1
    scalar_t roi_width = fmaxf((scalar_t)roi_end_w - roi_start_w, 0.);
    scalar_t roi_height = fmaxf((scalar_t)roi_end_h - roi_start_h, 0.);

    scalar_t bin_size_h = roi_height / pooled_height;
    scalar_t bin_size_w = roi_width / pooled_width;

    scalar_t *offset_bottom_diff =
        bottom_diff + (roi_batch_ind * channels + c) * height * width;
    int offset_top = (n * channels + c) * pooled_height * pooled_width +
                     ph * pooled_width + pw;
    scalar_t offset_top_diff = top_diff[offset_top];

    // adaptive sampling density when sample_num <= 0 (e.g., = 2)
    int sample_num_h =
        (sample_num > 0) ? sample_num : ceil(roi_height / pooled_height);
    int sample_num_w =
        (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width);
    const scalar_t count = (scalar_t)(sample_num_h * sample_num_w);

    for (int iy = 0; iy < sample_num_h; iy++) {
      const scalar_t y =
          roi_start_h + ph * bin_size_h +
          (scalar_t)(iy + .5f) * bin_size_h / (scalar_t)(sample_num_h);
      for (int ix = 0; ix < sample_num_w; ix++) {
        const scalar_t x =
            roi_start_w + pw * bin_size_w +
            (scalar_t)(ix + .5f) * bin_size_w / (scalar_t)(sample_num_w);
        scalar_t w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;
        bilinear_interpolate_gradient<scalar_t>(height, width, y, x, w1, w2,
                                                w3, w4, x_low, x_high, y_low,
                                                y_high);
        scalar_t g1 = offset_top_diff * w1 / count;
        scalar_t g2 = offset_top_diff * w2 / count;
        scalar_t g3 = offset_top_diff * w3 / count;
        scalar_t g4 = offset_top_diff * w4 / count;
        // indices of -1 mark out-of-range samples; skip them
        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
          atomicAdd(offset_bottom_diff + y_low * width + x_low, g1);
          atomicAdd(offset_bottom_diff + y_low * width + x_high, g2);
          atomicAdd(offset_bottom_diff + y_high * width + x_low, g3);
          atomicAdd(offset_bottom_diff + y_high * width + x_high, g4);
        }
      }
    }
  }
}
// Empty specialization for double: the generic backward kernel calls
// atomicAdd on scalar_t, and atomicAdd(double*) requires sm_60+.  The
// launcher rejects double inputs before launching, so this stub exists
// only to satisfy the instantiation generated by
// AT_DISPATCH_FLOATING_TYPES.
template <>
__global__ void ROIAlignBackward<double>(
    const int nthreads, const double *top_diff, const double *bottom_rois,
    const double spatial_scale, const int sample_num, const int channels,
    const int height, const int width, const int pooled_height,
    const int pooled_width, double *bottom_diff) {}
// Host-side launcher for ROIAlignBackward.  double inputs are rejected
// (the backward kernel specialization for double is an empty stub); the
// process aborts on any CUDA error reported after the launch.
int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
                            const float spatial_scale, const int sample_num,
                            const int channels, const int height,
                            const int width, const int num_rois,
                            const int pooled_height, const int pooled_width,
                            at::Tensor bottom_grad) {
  const int output_size = num_rois * pooled_height * pooled_width * channels;

  AT_DISPATCH_FLOATING_TYPES(
      top_grad.type(), "ROIAlignLaucherBackward", ([&] {
        const scalar_t *top_diff = top_grad.data<scalar_t>();
        const scalar_t *rois_data = rois.data<scalar_t>();
        scalar_t *bottom_diff = bottom_grad.data<scalar_t>();

        if (sizeof(scalar_t) == sizeof(double)) {
          fprintf(stderr, "double is not supported\n");
          exit(-1);
        }

        ROIAlignBackward<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
                output_size, top_diff, rois_data, spatial_scale, sample_num,
                channels, height, width, pooled_height, pooled_width,
                bottom_diff);
      }));

  cudaError_t err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
  return 1;
}
mmdet/ops/roi_pool/__init__.py
0 → 100644
View file @
108fc9e1
from
.functions.roi_pool
import
roi_pool
from
.modules.roi_pool
import
RoIPool
mmdet/ops/roi_pool/functions/__init__.py
0 → 100644
View file @
108fc9e1
mmdet/ops/roi_pool/functions/roi_pool.py
0 → 100644
View file @
108fc9e1
import
torch
from
torch.autograd
import
Function
from
..
import
roi_pool_cuda
class RoIPoolFunction(Function):
    """Autograd function wrapping the compiled ``roi_pool_cuda`` op.

    ``forward`` max-pools each RoI of ``features`` into a fixed
    ``(out_h, out_w)`` grid and records the argmax indices; ``backward``
    routes gradients through those indices.
    """

    @staticmethod
    def forward(ctx, features, rois, out_size, spatial_scale):
        # Normalize out_size into a pair (out_h, out_w).
        if isinstance(out_size, int):
            out_h = out_w = out_size
        elif isinstance(out_size, tuple):
            assert len(out_size) == 2
            assert isinstance(out_size[0], int)
            assert isinstance(out_size[1], int)
            out_h, out_w = out_size
        else:
            raise TypeError(
                '"out_size" must be an integer or tuple of integers')
        assert features.is_cuda
        ctx.save_for_backward(rois)

        num_channels = features.size(1)
        num_rois = rois.size(0)
        out_size = (num_rois, num_channels, out_h, out_w)
        output = features.new_zeros(*out_size)
        # int argmax buffer, filled by the CUDA kernel
        argmax = features.new_zeros(*out_size, dtype=torch.int)
        roi_pool_cuda.forward(features, rois, out_h, out_w, spatial_scale,
                              output, argmax)

        # Stash what backward needs beyond the saved rois tensor.
        ctx.spatial_scale = spatial_scale
        ctx.feature_size = features.size()
        ctx.argmax = argmax
        return output

    @staticmethod
    def backward(ctx, grad_output):
        assert grad_output.is_cuda
        spatial_scale = ctx.spatial_scale
        feature_size = ctx.feature_size
        argmax = ctx.argmax
        rois = ctx.saved_tensors[0]
        assert feature_size is not None

        grad_input = grad_rois = None
        if ctx.needs_input_grad[0]:
            grad_input = grad_output.new(feature_size).zero_()
            roi_pool_cuda.backward(grad_output, rois, argmax, spatial_scale,
                                   grad_input)
        # rois, out_size and spatial_scale receive no gradient
        return grad_input, grad_rois, None, None
# Functional alias: roi_pool(features, rois, out_size, spatial_scale)
roi_pool = RoIPoolFunction.apply
mmdet/ops/roi_pool/gradcheck.py
0 → 100644
View file @
108fc9e1
import torch
from torch.autograd import gradcheck

import os.path as osp
import sys
sys.path.append(osp.abspath(osp.join(__file__, '../../')))
# FIX: the package added in this commit is `roi_pool`; the old import of
# `roi_pooling` raised ImportError.
from roi_pool import RoIPool

# Small random feature map and three rois (batch_idx, x1, y1, x2, y2)
# for a numerical gradient check of the CUDA RoI pooling op.
feat = torch.randn(4, 16, 15, 15, requires_grad=True).cuda()
rois = torch.Tensor([[0, 0, 0, 50, 50], [0, 10, 30, 43, 55],
                     [1, 67, 40, 110, 120]]).cuda()
inputs = (feat, rois)
print('Gradcheck for roi pooling...')
test = gradcheck(RoIPool(4, 1.0 / 8), inputs, eps=1e-5, atol=1e-3)
print(test)
mmdet/ops/roi_pool/modules/__init__.py
0 → 100644
View file @
108fc9e1
mmdet/ops/roi_pool/modules/roi_pool.py
0 → 100644
View file @
108fc9e1
from
torch.nn.modules.module
import
Module
from
..functions.roi_pool
import
roi_pool
class RoIPool(Module):
    """``nn.Module`` wrapper around the functional ``roi_pool`` op."""

    def __init__(self, out_size, spatial_scale):
        super(RoIPool, self).__init__()
        # Output grid size: int or (h, w) tuple; validated by roi_pool.
        self.out_size = out_size
        # Factor mapping roi coordinates onto the feature map.
        self.spatial_scale = float(spatial_scale)

    def forward(self, features, rois):
        return roi_pool(features, rois, self.out_size, self.spatial_scale)
mmdet/ops/roi_pool/setup.py
0 → 100644
View file @
108fc9e1
from
setuptools
import
setup
from
torch.utils.cpp_extension
import
BuildExtension
,
CUDAExtension
# Build the roi_pool_cuda extension from its C++ binding and CUDA kernel.
setup(
    name='roi_pool',
    ext_modules=[
        CUDAExtension('roi_pool_cuda', [
            'src/roi_pool_cuda.cpp',
            'src/roi_pool_kernel.cu',
        ])
    ],
    cmdclass={'build_ext': BuildExtension})
mmdet/ops/roi_pool/src/roi_pool_cuda.cpp
0 → 100644
View file @
108fc9e1
#include <torch/torch.h>
#include <cmath>
#include <vector>
int
ROIPoolForwardLaucher
(
const
at
::
Tensor
features
,
const
at
::
Tensor
rois
,
const
float
spatial_scale
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
num_rois
,
const
int
pooled_h
,
const
int
pooled_w
,
at
::
Tensor
output
,
at
::
Tensor
argmax
);
int
ROIPoolBackwardLaucher
(
const
at
::
Tensor
top_grad
,
const
at
::
Tensor
rois
,
const
at
::
Tensor
argmax
,
const
float
spatial_scale
,
const
int
batch_size
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
num_rois
,
const
int
pooled_h
,
const
int
pooled_w
,
at
::
Tensor
bottom_grad
);
#define CHECK_CUDA(x) AT_ASSERT(x.type().is_cuda(), #x " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
AT_ASSERT(x.is_contiguous(), #x " must be contiguous ")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
// Python-facing forward entry point: validates that all tensors are
// contiguous CUDA tensors, infers shapes, and launches the forward
// kernel.  Returns 1 on success, 0 when rois does not have 5 columns.
int roi_pooling_forward_cuda(at::Tensor features, at::Tensor rois,
                             int pooled_height, int pooled_width,
                             float spatial_scale, at::Tensor output,
                             at::Tensor argmax) {
  CHECK_INPUT(features);
  CHECK_INPUT(rois);
  CHECK_INPUT(output);
  CHECK_INPUT(argmax);

  // Number of ROIs; each row must be (batch_idx, x1, y1, x2, y2).
  int num_rois = rois.size(0);
  int size_rois = rois.size(1);
  if (size_rois != 5) {
    printf("wrong roi size\n");
    return 0;
  }

  int channels = features.size(1);
  int height = features.size(2);
  int width = features.size(3);

  ROIPoolForwardLaucher(features, rois, spatial_scale, channels, height,
                        width, num_rois, pooled_height, pooled_width, output,
                        argmax);
  return 1;
}
// Python-facing backward entry point: validates inputs, infers shapes
// from the gradient tensors, and launches the backward kernel.  Returns
// 1 on success, 0 when rois does not have 5 columns.
int roi_pooling_backward_cuda(at::Tensor top_grad, at::Tensor rois,
                              at::Tensor argmax, float spatial_scale,
                              at::Tensor bottom_grad) {
  CHECK_INPUT(top_grad);
  CHECK_INPUT(rois);
  CHECK_INPUT(argmax);
  CHECK_INPUT(bottom_grad);

  int pooled_height = top_grad.size(2);
  int pooled_width = top_grad.size(3);
  int num_rois = rois.size(0);
  int size_rois = rois.size(1);
  if (size_rois != 5) {
    printf("wrong roi size\n");
    return 0;
  }

  int batch_size = bottom_grad.size(0);
  int channels = bottom_grad.size(1);
  int height = bottom_grad.size(2);
  int width = bottom_grad.size(3);

  ROIPoolBackwardLaucher(top_grad, rois, argmax, spatial_scale, batch_size,
                         channels, height, width, num_rois, pooled_height,
                         pooled_width, bottom_grad);
  return 1;
}
// Python bindings: exposed as roi_pool_cuda.forward / roi_pool_cuda.backward.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &roi_pooling_forward_cuda, "Roi_Pooling forward (CUDA)");
  m.def("backward", &roi_pooling_backward_cuda, "Roi_Pooling backward (CUDA)");
}
mmdet/ops/roi_pool/src/roi_pool_kernel.cu
0 → 100644
View file @
108fc9e1
#include <ATen/ATen.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <vector>
// Grid-stride loop over a flat 1-D index space of `n` elements.
#define CUDA_1D_KERNEL_LOOP(i, n)                            \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
       i += blockDim.x * gridDim.x)

#define THREADS_PER_BLOCK 1024

// Number of 1-D grid blocks needed to cover `N` elements with
// THREADS_PER_BLOCK threads each, clamped to a conservative 65000
// blocks (kernels use a grid-stride loop, so clamping stays correct).
inline int GET_BLOCKS(const int N) {
  const int wanted = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
  const int cap = 65000;
  return wanted < cap ? wanted : cap;
}
// RoI max-pooling forward kernel.  One thread per output element
// (n, c, ph, pw): scans its bin in the input map, writes the maximum to
// top_data and the flat in-channel offset of that maximum (or -1 for an
// empty bin) to argmax_data.
template <typename scalar_t>
__global__ void ROIPoolForward(const int nthreads, const scalar_t *bottom_data,
                               const scalar_t *rois,
                               const scalar_t spatial_scale,
                               const int channels, const int height,
                               const int width, const int pooled_h,
                               const int pooled_w, scalar_t *top_data,
                               int *argmax_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_w;
    int ph = (index / pooled_w) % pooled_h;
    int c = (index / pooled_w / pooled_h) % channels;
    int n = index / pooled_w / pooled_h / channels;

    const scalar_t *offset_rois = rois + n * 5;
    int roi_batch_ind = offset_rois[0];

    // calculate the roi region on feature maps
    scalar_t roi_x1 = offset_rois[1] * spatial_scale;
    scalar_t roi_y1 = offset_rois[2] * spatial_scale;
    scalar_t roi_x2 = (offset_rois[3] + 1) * spatial_scale;
    scalar_t roi_y2 = (offset_rois[4] + 1) * spatial_scale;

    // degenerate roi: skip, leaving this output element untouched
    // (presumably the caller pre-zeroes output/argmax — confirm upstream)
    scalar_t roi_w = roi_x2 - roi_x1;
    scalar_t roi_h = roi_y2 - roi_y1;
    if (roi_w <= 0 || roi_h <= 0) continue;

    scalar_t bin_size_w = roi_w / static_cast<scalar_t>(pooled_w);
    scalar_t bin_size_h = roi_h / static_cast<scalar_t>(pooled_h);

    // the corresponding bin region
    int bin_x1 = floor(static_cast<scalar_t>(pw) * bin_size_w + roi_x1);
    int bin_y1 = floor(static_cast<scalar_t>(ph) * bin_size_h + roi_y1);
    int bin_x2 = ceil(static_cast<scalar_t>(pw + 1) * bin_size_w + roi_x1);
    int bin_y2 = ceil(static_cast<scalar_t>(ph + 1) * bin_size_h + roi_y1);

    // add roi offsets and clip to input boundaries
    bin_x1 = min(max(bin_x1, 0), width);
    bin_y1 = min(max(bin_y1, 0), height);
    bin_x2 = min(max(bin_x2, 0), width);
    bin_y2 = min(max(bin_y2, 0), height);
    bool is_empty = (bin_y2 <= bin_y1) || (bin_x2 <= bin_x1);

    // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
    int max_idx = -1;
    bottom_data += (roi_batch_ind * channels + c) * height * width;

    // Define an empty pooling region to be zero; otherwise seed the max
    // just below the first candidate so the scan always records an index.
    scalar_t max_val = is_empty ? 0 : bottom_data[bin_y1 * width + bin_x1] - 1;

    for (int h = bin_y1; h < bin_y2; ++h) {
      for (int w = bin_x1; w < bin_x2; ++w) {
        int offset = h * width + w;
        if (bottom_data[offset] > max_val) {
          max_val = bottom_data[offset];
          max_idx = offset;
        }
      }
    }
    top_data[index] = max_val;
    if (argmax_data != NULL) argmax_data[index] = max_idx;
  }
}
// Host-side launcher for ROIPoolForward.  Dispatches on the floating
// dtype of `features`, launches one thread per output element, and
// aborts the process on any CUDA error reported after the launch.
int ROIPoolForwardLaucher(const at::Tensor features, const at::Tensor rois,
                          const float spatial_scale, const int channels,
                          const int height, const int width,
                          const int num_rois, const int pooled_h,
                          const int pooled_w, at::Tensor output,
                          at::Tensor argmax) {
  const int output_size = num_rois * channels * pooled_h * pooled_w;

  AT_DISPATCH_FLOATING_TYPES(
      features.type(), "ROIPoolLaucherForward", ([&] {
        const scalar_t *bottom_data = features.data<scalar_t>();
        const scalar_t *rois_data = rois.data<scalar_t>();
        scalar_t *top_data = output.data<scalar_t>();
        int *argmax_data = argmax.data<int>();

        ROIPoolForward<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
                output_size, bottom_data, rois_data, scalar_t(spatial_scale),
                channels, height, width, pooled_h, pooled_w, top_data,
                argmax_data);
      }));

  cudaError_t err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
  return 1;
}
// RoI max-pooling backward kernel.  One thread per output-gradient
// element: routes top_diff[index] to the input position recorded in
// argmax_data via atomicAdd (rois may overlap, so writes can collide).
// FIX: empty bins store argmax == -1 (see ROIPoolForward); the old code
// unconditionally did atomicAdd at offset -1 for them — an out-of-bounds
// global write.  Such bins are now skipped.
template <typename scalar_t>
__global__ void ROIPoolBackward(const int nthreads, const scalar_t *top_diff,
                                const scalar_t *rois, const int *argmax_data,
                                const scalar_t spatial_scale,
                                const int channels, const int height,
                                const int width, const int pooled_h,
                                const int pooled_w, scalar_t *bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int pw = index % pooled_w;
    int ph = (index / pooled_w) % pooled_h;
    int c = (index / pooled_w / pooled_h) % channels;
    int n = index / pooled_w / pooled_h / channels;

    int roi_batch_ind = rois[n * 5];
    int bottom_index = argmax_data[(n * channels + c) * pooled_h * pooled_w +
                                   ph * pooled_w + pw];

    if (bottom_index != -1) {
      atomicAdd(bottom_diff + (roi_batch_ind * channels + c) * height * width +
                    bottom_index,
                top_diff[index]);
    }
  }
}
// Empty specialization for double: the generic backward kernel calls
// atomicAdd on scalar_t, and atomicAdd(double*) requires sm_60+.  The
// launcher rejects double inputs before launching, so this stub exists
// only to satisfy the instantiation generated by
// AT_DISPATCH_FLOATING_TYPES.
template <>
__global__ void ROIPoolBackward<double>(
    const int nthreads, const double *top_diff, const double *rois,
    const int *argmax_data, const double spatial_scale, const int channels,
    const int height, const int width, const int pooled_h, const int pooled_w,
    double *bottom_diff) {
  // intentionally empty
}
// Host-side launcher for ROIPoolBackward.  double inputs are rejected
// (the backward kernel specialization for double is an empty stub); the
// process aborts on any CUDA error reported after the launch.
int ROIPoolBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
                           const at::Tensor argmax, const float spatial_scale,
                           const int batch_size, const int channels,
                           const int height, const int width,
                           const int num_rois, const int pooled_h,
                           const int pooled_w, at::Tensor bottom_grad) {
  const int output_size = num_rois * pooled_h * pooled_w * channels;

  AT_DISPATCH_FLOATING_TYPES(
      top_grad.type(), "ROIPoolLaucherBackward", ([&] {
        const scalar_t *top_diff = top_grad.data<scalar_t>();
        const scalar_t *rois_data = rois.data<scalar_t>();
        const int *argmax_data = argmax.data<int>();
        scalar_t *bottom_diff = bottom_grad.data<scalar_t>();

        if (sizeof(scalar_t) == sizeof(double)) {
          fprintf(stderr, "double is not supported\n");
          exit(-1);
        }

        ROIPoolBackward<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
                output_size, top_diff, rois_data, argmax_data,
                scalar_t(spatial_scale), channels, height, width, pooled_h,
                pooled_w, bottom_diff);
      }));

  cudaError_t err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
  return 1;
}
mmdet/version.py
0 → 100644
View file @
108fc9e1
# Package version string.
__version__ = '0.5.0'
setup.py
0 → 100644
View file @
108fc9e1
from
setuptools
import
find_packages
,
setup
def readme():
    """Return the contents of README.md for use as the long description."""
    with open('README.md') as f:
        return f.read()
def get_version():
    """Execute the package version file and return its ``__version__``.

    FIX: the path previously pointed at ``mmcv/version.py`` (copied from
    the mmcv setup script); this package stores its version in
    ``mmdet/version.py``.
    """
    version_file = 'mmdet/version.py'
    with open(version_file, 'r') as f:
        exec(compile(f.read(), version_file, 'exec'))
    return locals()['__version__']
# Package metadata; version and long description are read from the
# repository at build time via get_version() and readme().
setup(
    name='mmdet',
    version=get_version(),
    description='Open MMLab Detection Toolbox',
    long_description=readme(),
    keywords='computer vision, object detection',
    packages=find_packages(),
    classifiers=[
        'Development Status :: 4 - Beta',
        'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
        'Operating System :: OS Independent',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Topic :: Utilities',
    ],
    license='GPLv3',
    setup_requires=['pytest-runner'],
    tests_require=['pytest'],
    install_requires=['numpy', 'matplotlib', 'six', 'terminaltables'],
    zip_safe=False)
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment