Unverified Commit c0f5492e authored by zhuyuanhao, committed by GitHub

add ext ops, support parrots (#310)



* add ext ops, support parrots

* fix lint

* fix lint

* update op from mmdetection

* support non-pytorch env

* fix import bug

* test not import mmcv.op

* rename mmcv.op to mmcv.ops

* fix compile warning

* 1. fix syncbn warning in pytorch 1.5
2. support only cpu compile
3. add point_sample from mmdet

* fix text bug

* update docstrings

* fix line endings

* minor updates

* remove non_local from ops

* bug fix for nonlocal2d

* rename ops_ext to _ext and _ext to _flow_warp_ext

* update the doc

* try clang-format github action

* fix github action

* add ops to api.rst

* fix cpp format

* fix clang format issues

* remove .clang-format
Co-authored-by: Kai Chen <chenkaidev@gmail.com>
parent a7bf7701
#ifndef SYNC_BN_KERNEL_CUH
#define SYNC_BN_KERNEL_CUH
template <typename T>
__global__ void sync_bn_forward_mean_cuda_kernel(const T *input, float *mean,
int num, int channels,
int spatial) {
__shared__ float buffer[THREADS_PER_BLOCK];
int tid = threadIdx.x;
int c = blockIdx.x;
buffer[tid] = 0;
for (int i = tid; i < num * spatial; i += blockDim.x) {
int index = (i / spatial) * channels * spatial + c * spatial + i % spatial;
buffer[tid] += input[index];
}
__syncthreads();
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
buffer[tid] += buffer[tid + s];
}
__syncthreads();
}
int total = num * spatial;
if (tid == 0) {
mean[c] = buffer[0] / total;
}
}
template <>
__global__ void sync_bn_forward_mean_cuda_kernel(const phalf *input,
float *mean, int num,
int channels, int spatial) {
__shared__ float buffer[THREADS_PER_BLOCK];
int tid = threadIdx.x;
int c = blockIdx.x;
buffer[tid] = 0;
for (int i = tid; i < num * spatial; i += blockDim.x) {
int index = (i / spatial) * channels * spatial + c * spatial + i % spatial;
buffer[tid] += static_cast<float>(input[index]);
}
__syncthreads();
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
buffer[tid] += buffer[tid + s];
}
__syncthreads();
}
int total = num * spatial;
if (tid == 0) {
mean[c] = buffer[0] / total;
}
}
template <typename T>
__global__ void sync_bn_forward_var_cuda_kernel(const T *input,
const float *mean, float *var,
int num, int channels,
int spatial) {
__shared__ float buffer[THREADS_PER_BLOCK];
int tid = threadIdx.x;
int c = blockIdx.x;
buffer[tid] = 0;
for (int i = tid; i < num * spatial; i += blockDim.x) {
int index = (i / spatial) * channels * spatial + c * spatial + i % spatial;
float td = input[index] - mean[c];
buffer[tid] += td * td;
}
__syncthreads();
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
buffer[tid] += buffer[tid + s];
}
__syncthreads();
}
int total = num * spatial;
if (tid == 0) {
var[c] = buffer[0] / total;
}
}
template <>
__global__ void sync_bn_forward_var_cuda_kernel(const phalf *input,
const float *mean, float *var,
int num, int channels,
int spatial) {
__shared__ float buffer[THREADS_PER_BLOCK];
int tid = threadIdx.x;
int c = blockIdx.x;
buffer[tid] = 0;
for (int i = tid; i < num * spatial; i += blockDim.x) {
int index = (i / spatial) * channels * spatial + c * spatial + i % spatial;
float td = static_cast<float>(input[index]) - mean[c];
buffer[tid] += td * td;
}
__syncthreads();
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
buffer[tid] += buffer[tid + s];
}
__syncthreads();
}
int total = num * spatial;
if (tid == 0) {
var[c] = buffer[0] / total;
}
}
template <typename T>
__global__ void sync_bn_forward_output_cuda_kernel(
const T *input, const float *mean, const float *var, float *running_mean,
float *running_var, const float *weight, const float *bias, float *norm,
float *std, T *output, int num, int channels, int spatial, float eps,
float momentum, int group_size) {
int tid = threadIdx.x;
int c = blockIdx.x;
float mean_value = mean[c];
float std_value = sqrt(var[c] + eps);
if (weight != nullptr) {
float weight_value = weight[c];
float bias_value = bias[c];
if (norm != nullptr) {
for (int i = tid; i < num * spatial; i += blockDim.x) {
int index =
(i / spatial) * channels * spatial + c * spatial + i % spatial;
norm[index] = (input[index] - mean_value) / std_value;
output[index] = norm[index] * weight_value + bias_value;
}
} else {
for (int i = tid; i < num * spatial; i += blockDim.x) {
int index =
(i / spatial) * channels * spatial + c * spatial + i % spatial;
output[index] =
(input[index] - mean_value) / std_value * weight_value + bias_value;
}
}
} else {
if (norm != nullptr) {
for (int i = tid; i < num * spatial; i += blockDim.x) {
int index =
(i / spatial) * channels * spatial + c * spatial + i % spatial;
output[index] = norm[index] = (input[index] - mean_value) / std_value;
}
} else {
for (int i = tid; i < num * spatial; i += blockDim.x) {
int index =
(i / spatial) * channels * spatial + c * spatial + i % spatial;
output[index] = (input[index] - mean_value) / std_value;
}
}
}
if (tid == 0) {
if (std != nullptr) std[c] = std_value;
if (running_mean != nullptr) {
running_mean[c] =
momentum * mean_value + (1 - momentum) * running_mean[c];
int count = num * spatial * group_size;
float var_unbias = count > 1 ? var[c] * count / (count - 1) : var[c];
running_var[c] = momentum * var_unbias + (1 - momentum) * running_var[c];
}
}
}
template <>
__global__ void sync_bn_forward_output_cuda_kernel(
const phalf *input, const float *mean, const float *var,
float *running_mean, float *running_var, const float *weight,
const float *bias, float *norm, float *std, phalf *output, int num,
int channels, int spatial, float eps, float momentum, int group_size) {
int tid = threadIdx.x;
int c = blockIdx.x;
float mean_value = mean[c];
float std_value = sqrt(var[c] + eps);
if (weight != nullptr) {
float weight_value = weight[c];
float bias_value = bias[c];
if (norm != nullptr) {
for (int i = tid; i < num * spatial; i += blockDim.x) {
int index =
(i / spatial) * channels * spatial + c * spatial + i % spatial;
norm[index] =
(static_cast<float>(input[index]) - mean_value) / std_value;
output[index] =
static_cast<phalf>(norm[index] * weight_value + bias_value);
}
} else {
for (int i = tid; i < num * spatial; i += blockDim.x) {
int index =
(i / spatial) * channels * spatial + c * spatial + i % spatial;
output[index] =
static_cast<phalf>((static_cast<float>(input[index]) - mean_value) /
std_value * weight_value +
bias_value);
}
}
} else {
if (norm != nullptr) {
for (int i = tid; i < num * spatial; i += blockDim.x) {
int index =
(i / spatial) * channels * spatial + c * spatial + i % spatial;
norm[index] =
(static_cast<float>(input[index]) - mean_value) / std_value;
output[index] = static_cast<phalf>(norm[index]);
}
} else {
for (int i = tid; i < num * spatial; i += blockDim.x) {
int index =
(i / spatial) * channels * spatial + c * spatial + i % spatial;
output[index] = static_cast<phalf>(
(static_cast<float>(input[index]) - mean_value) / std_value);
}
}
}
if (tid == 0) {
if (std != nullptr) std[c] = std_value;
if (running_mean != nullptr) {
running_mean[c] =
momentum * mean_value + (1 - momentum) * running_mean[c];
int count = num * spatial * group_size;
float var_unbias = count > 1 ? var[c] * count / (count - 1) : var[c];
running_var[c] = momentum * var_unbias + (1 - momentum) * running_var[c];
}
}
}
template <typename T>
__global__ void sync_bn_backward_param_cuda_kernel(const T *grad_output,
const float *norm,
float *grad_weight,
float *grad_bias, int num,
int channels, int spatial) {
__shared__ float buffer1[THREADS_PER_BLOCK];
__shared__ float buffer2[THREADS_PER_BLOCK];
int tid = threadIdx.x;
int c = blockIdx.x;
buffer1[tid] = buffer2[tid] = 0;
for (int i = tid; i < num * spatial; i += blockDim.x) {
int index = (i / spatial) * channels * spatial + c * spatial + i % spatial;
buffer1[tid] += grad_output[index] * norm[index];
buffer2[tid] += grad_output[index];
}
__syncthreads();
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
buffer1[tid] += buffer1[tid + s];
buffer2[tid] += buffer2[tid + s];
}
__syncthreads();
}
if (tid == 0) {
grad_weight[c] = buffer1[0];
grad_bias[c] = buffer2[0];
}
}
template <>
__global__ void sync_bn_backward_param_cuda_kernel(const phalf *grad_output,
const float *norm,
float *grad_weight,
float *grad_bias, int num,
int channels, int spatial) {
__shared__ float buffer1[THREADS_PER_BLOCK];
__shared__ float buffer2[THREADS_PER_BLOCK];
int tid = threadIdx.x;
int c = blockIdx.x;
buffer1[tid] = buffer2[tid] = 0;
for (int i = tid; i < num * spatial; i += blockDim.x) {
int index = (i / spatial) * channels * spatial + c * spatial + i % spatial;
buffer1[tid] += static_cast<float>(grad_output[index]) * norm[index];
buffer2[tid] += static_cast<float>(grad_output[index]);
}
__syncthreads();
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
buffer1[tid] += buffer1[tid + s];
buffer2[tid] += buffer2[tid + s];
}
__syncthreads();
}
if (tid == 0) {
grad_weight[c] = buffer1[0];
grad_bias[c] = buffer2[0];
}
}
template <typename T>
__global__ void sync_bn_backward_data_cuda_kernel(
int output_size, const T *grad_output, const float *weight,
const float *grad_weight, const float *grad_bias, const float *norm,
const float *std, T *grad_input, int num, int channels, int spatial) {
int factor = num * spatial;
CUDA_1D_KERNEL_LOOP(index, output_size) {
int c = (index / spatial) % channels;
grad_input[index] =
weight[c] *
(grad_output[index] -
(grad_weight[c] * norm[index] + grad_bias[c]) / factor) /
std[c];
}
}
template <>
__global__ void sync_bn_backward_data_cuda_kernel(
int output_size, const phalf *grad_output, const float *weight,
const float *grad_weight, const float *grad_bias, const float *norm,
const float *std, phalf *grad_input, int num, int channels, int spatial) {
int factor = num * spatial;
CUDA_1D_KERNEL_LOOP(index, output_size) {
int c = (index / spatial) % channels;
grad_input[index] = static_cast<phalf>(
weight[c] *
(static_cast<float>(grad_output[index]) -
(grad_weight[c] * norm[index] + grad_bias[c]) / factor) /
std[c]);
}
}
#endif // SYNC_BN_KERNEL_CUH
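For reference (not part of this diff): the kernels above reduce over the batch and spatial dimensions per channel, normalize with the biased variance, and update the running statistics with the unbiased variance. A minimal NumPy sketch of the same math for a single process, assuming float32 inputs and no half-precision path:

```python
import numpy as np

def sync_bn_forward_reference(x, running_mean, running_var, weight, bias,
                              eps=1e-5, momentum=0.1, group_size=1):
    # x: (N, C, *spatial); statistics are reduced over N and the spatial dims,
    # mirroring sync_bn_forward_mean/var/output_cuda_kernel for one process.
    n, c = x.shape[:2]
    spatial = int(np.prod(x.shape[2:]))
    x2d = x.reshape(n, c, spatial)
    mean = x2d.mean(axis=(0, 2))                                 # per-channel mean
    var = ((x2d - mean[None, :, None]) ** 2).mean(axis=(0, 2))   # biased variance
    std = np.sqrt(var + eps)
    norm = (x2d - mean[None, :, None]) / std[None, :, None]
    out = norm * weight[None, :, None] + bias[None, :, None]
    # running stats use the unbiased variance, as in the output kernel
    count = n * spatial * group_size
    var_unbias = var * count / (count - 1) if count > 1 else var
    new_running_mean = momentum * mean + (1 - momentum) * running_mean
    new_running_var = momentum * var_unbias + (1 - momentum) * running_var
    return out.reshape(x.shape), new_running_mean, new_running_var
```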
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair, _single
from ..cnn import CONV_LAYERS
from ..utils import ext_loader, print_log
ext_module = ext_loader.load_ext('_ext', [
'deform_conv_forward', 'deform_conv_backward_input',
'deform_conv_backward_parameters'
])
class DeformConv2dFunction(Function):
@staticmethod
def symbolic(g, input, offset, weight, stride, padding, dilation, groups,
deform_groups, bias, im2col_step):
return g.op(
'MMCVDeformConv2d',
input,
offset,
weight,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
deform_groups=deform_groups,
bias=bias,
im2col_step=im2col_step)
@staticmethod
def forward(ctx,
input,
offset,
weight,
stride=1,
padding=0,
dilation=1,
groups=1,
deform_groups=1,
bias=False,
im2col_step=32):
if input is not None and input.dim() != 4:
raise ValueError(
                f'Expected 4D tensor as input, got {input.dim()}D tensor '
                'instead.')
assert bias is False, 'Only support bias is False.'
ctx.stride = _pair(stride)
ctx.padding = _pair(padding)
ctx.dilation = _pair(dilation)
ctx.groups = groups
ctx.deform_groups = deform_groups
ctx.im2col_step = im2col_step
ctx.save_for_backward(input, offset, weight)
output = input.new_empty(
DeformConv2dFunction._output_size(ctx, input, weight))
ctx.bufs_ = [input.new_empty(0), input.new_empty(0)] # columns, ones
cur_im2col_step = min(ctx.im2col_step, input.size(0))
assert (input.size(0) %
cur_im2col_step) == 0, 'im2col step must divide batchsize'
ext_module.deform_conv_forward(
input,
weight,
offset,
output,
ctx.bufs_[0],
ctx.bufs_[1],
kW=weight.size(3),
kH=weight.size(2),
dW=ctx.stride[1],
dH=ctx.stride[0],
padW=ctx.padding[1],
padH=ctx.padding[0],
dilationW=ctx.dilation[1],
dilationH=ctx.dilation[0],
group=ctx.groups,
deformable_group=ctx.deform_groups,
im2col_step=cur_im2col_step)
return output
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
input, offset, weight = ctx.saved_tensors
grad_input = grad_offset = grad_weight = None
cur_im2col_step = min(ctx.im2col_step, input.size(0))
assert (input.size(0) %
cur_im2col_step) == 0, 'im2col step must divide batchsize'
grad_output = grad_output.contiguous()
if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
grad_input = torch.zeros_like(input)
grad_offset = torch.zeros_like(offset)
ext_module.deform_conv_backward_input(
input,
offset,
grad_output,
grad_input,
grad_offset,
weight,
ctx.bufs_[0],
kW=weight.size(3),
kH=weight.size(2),
dW=ctx.stride[1],
dH=ctx.stride[0],
padW=ctx.padding[1],
padH=ctx.padding[0],
dilationW=ctx.dilation[1],
dilationH=ctx.dilation[0],
group=ctx.groups,
deformable_group=ctx.deform_groups,
im2col_step=cur_im2col_step)
if ctx.needs_input_grad[2]:
grad_weight = torch.zeros_like(weight)
ext_module.deform_conv_backward_parameters(
input,
offset,
grad_output,
grad_weight,
ctx.bufs_[0],
ctx.bufs_[1],
kW=weight.size(3),
kH=weight.size(2),
dW=ctx.stride[1],
dH=ctx.stride[0],
padW=ctx.padding[1],
padH=ctx.padding[0],
dilationW=ctx.dilation[1],
dilationH=ctx.dilation[0],
group=ctx.groups,
deformable_group=ctx.deform_groups,
scale=1,
im2col_step=cur_im2col_step)
return grad_input, grad_offset, grad_weight, \
None, None, None, None, None, None, None
@staticmethod
def _output_size(ctx, input, weight):
channels = weight.size(0)
output_size = (input.size(0), channels)
for d in range(input.dim() - 2):
in_size = input.size(d + 2)
pad = ctx.padding[d]
kernel = ctx.dilation[d] * (weight.size(d + 2) - 1) + 1
stride_ = ctx.stride[d]
output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, )
if not all(map(lambda s: s > 0, output_size)):
raise ValueError(
'convolution input is too small (output would be ' +
'x'.join(map(str, output_size)) + ')')
return output_size
deform_conv2d = DeformConv2dFunction.apply
class DeformConv2d(nn.Module):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
deform_groups=1,
bias=False):
super(DeformConv2d, self).__init__()
        assert in_channels % groups == 0, \
            f'in_channels {in_channels} is not divisible by groups {groups}'
        assert out_channels % groups == 0, \
            f'out_channels {out_channels} is not divisible by groups ' \
            f'{groups}'
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = _pair(kernel_size)
self.stride = _pair(stride)
self.padding = _pair(padding)
self.dilation = _pair(dilation)
self.groups = groups
self.deform_groups = deform_groups
# enable compatibility with nn.Conv2d
self.transposed = False
self.output_padding = _single(0)
# only weight, no bias
self.weight = nn.Parameter(
torch.Tensor(out_channels, in_channels // self.groups,
*self.kernel_size))
self.reset_parameters()
def reset_parameters(self):
n = self.in_channels
for k in self.kernel_size:
n *= k
stdv = 1. / math.sqrt(n)
self.weight.data.uniform_(-stdv, stdv)
def forward(self, x, offset):
        # Pad the input when it is smaller than the kernel, to avoid an
        # assert error in deform_conv_cuda.cpp:128
input_pad = (x.size(2) < self.kernel_size[0]) or (x.size(3) <
self.kernel_size[1])
if input_pad:
pad_h = max(self.kernel_size[0] - x.size(2), 0)
pad_w = max(self.kernel_size[1] - x.size(3), 0)
x = F.pad(x, (0, pad_w, 0, pad_h), 'constant', 0).contiguous()
offset = F.pad(offset, (0, pad_w, 0, pad_h), 'constant', 0)
offset = offset.contiguous()
out = deform_conv2d(x, offset, self.weight, self.stride, self.padding,
self.dilation, self.groups, self.deform_groups)
if input_pad:
out = out[:, :, :out.size(2) - pad_h, :out.size(3) -
pad_w].contiguous()
return out
@CONV_LAYERS.register_module('DCN')
class DeformConv2dPack(DeformConv2d):
"""A Deformable Conv Encapsulation that acts as normal Conv layers.
The offset tensor is like `[y0, x0, y1, x1, y2, x2, ..., y8, x8]`.
The spatial arrangement is like:
```
(x0, y0) (x1, y1) (x2, y2)
(x3, y3) (x4, y4) (x5, y5)
(x6, y6) (x7, y7) (x8, y8)
```
Args:
in_channels (int): Same as nn.Conv2d.
out_channels (int): Same as nn.Conv2d.
kernel_size (int or tuple[int]): Same as nn.Conv2d.
stride (int or tuple[int]): Same as nn.Conv2d.
padding (int or tuple[int]): Same as nn.Conv2d.
dilation (int or tuple[int]): Same as nn.Conv2d.
groups (int): Same as nn.Conv2d.
bias (bool or str): If specified as `auto`, it will be decided by the
norm_cfg. Bias will be set as True if norm_cfg is None, otherwise
False.
"""
_version = 2
def __init__(self, *args, **kwargs):
super(DeformConv2dPack, self).__init__(*args, **kwargs)
self.conv_offset = nn.Conv2d(
self.in_channels,
self.deform_groups * 2 * self.kernel_size[0] * self.kernel_size[1],
kernel_size=self.kernel_size,
stride=_pair(self.stride),
padding=_pair(self.padding),
dilation=_pair(self.dilation),
bias=True)
self.init_offset()
def init_offset(self):
self.conv_offset.weight.data.zero_()
self.conv_offset.bias.data.zero_()
def forward(self, x):
offset = self.conv_offset(x)
return deform_conv2d(x, offset, self.weight, self.stride, self.padding,
self.dilation, self.groups, self.deform_groups)
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
missing_keys, unexpected_keys, error_msgs):
version = local_metadata.get('version', None)
if version is None or version < 2:
# the key is different in early versions
# In version < 2, DeformConvPack loads previous benchmark models.
if (prefix + 'conv_offset.weight' not in state_dict
and prefix[:-1] + '_offset.weight' in state_dict):
state_dict[prefix + 'conv_offset.weight'] = state_dict.pop(
prefix[:-1] + '_offset.weight')
if (prefix + 'conv_offset.bias' not in state_dict
and prefix[:-1] + '_offset.bias' in state_dict):
state_dict[prefix +
'conv_offset.bias'] = state_dict.pop(prefix[:-1] +
'_offset.bias')
if version is not None and version > 1:
print_log(
f'DeformConv2dPack {prefix.rstrip(".")} is upgraded to '
'version 2.',
logger='root')
super()._load_from_state_dict(state_dict, prefix, local_metadata,
strict, missing_keys, unexpected_keys,
error_msgs)
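For reference (not part of this diff): a minimal usage sketch of `DeformConv2dPack`, which predicts its own offsets with the internal `conv_offset` layer and is registered as 'DCN' in `CONV_LAYERS`. It assumes the compiled `_ext` extension, a CUDA device, and that the class is importable from `mmcv.ops` (the module renamed from `mmcv.op` in this PR):

```python
import torch
from mmcv.ops import DeformConv2dPack  # assumed import path

conv = DeformConv2dPack(16, 32, kernel_size=3, padding=1, deform_groups=1).cuda()
x = torch.randn(2, 16, 28, 28, device='cuda')
out = conv(x)            # offsets are predicted internally by conv_offset
print(out.shape)         # torch.Size([2, 32, 28, 28])
```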
from torch import nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair
from ..utils import ext_loader
ext_module = ext_loader.load_ext(
'_ext', ['deform_roi_pool_forward', 'deform_roi_pool_backward'])
class DeformRoIPoolFunction(Function):
@staticmethod
def symbolic(g, input, rois, offset, output_size, spatial_scale,
sampling_ratio, gamma):
return g.op(
'MMCVDeformRoIPool',
input,
rois,
offset,
pooled_height=output_size[0],
pooled_width=output_size[1],
spatial_scale=spatial_scale,
sampling_ratio=sampling_ratio,
gamma=gamma)
@staticmethod
def forward(ctx,
input,
rois,
offset,
output_size,
spatial_scale=1.0,
sampling_ratio=0,
gamma=0.1):
if offset is None:
offset = input.new_zeros(0)
ctx.output_size = _pair(output_size)
ctx.spatial_scale = float(spatial_scale)
ctx.sampling_ratio = int(sampling_ratio)
ctx.gamma = float(gamma)
assert rois.size(1) == 5, 'RoI must be (idx, x1, y1, x2, y2)!'
output_shape = (rois.size(0), input.size(1), ctx.output_size[0],
ctx.output_size[1])
output = input.new_zeros(output_shape)
ext_module.deform_roi_pool_forward(
input,
rois,
offset,
output,
pooled_height=ctx.output_size[0],
pooled_width=ctx.output_size[1],
spatial_scale=ctx.spatial_scale,
sampling_ratio=ctx.sampling_ratio,
gamma=ctx.gamma)
ctx.save_for_backward(input, rois, offset)
return output
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
input, rois, offset = ctx.saved_tensors
grad_input = grad_output.new_zeros(input.shape)
grad_offset = grad_output.new_zeros(offset.shape)
ext_module.deform_roi_pool_backward(
grad_output,
input,
rois,
offset,
grad_input,
grad_offset,
pooled_height=ctx.output_size[0],
pooled_width=ctx.output_size[1],
spatial_scale=ctx.spatial_scale,
sampling_ratio=ctx.sampling_ratio,
gamma=ctx.gamma)
if grad_offset.numel() == 0:
grad_offset = None
return grad_input, None, grad_offset, None, None, None, None
deform_roi_pool = DeformRoIPoolFunction.apply
class DeformRoIPool(nn.Module):
def __init__(self,
output_size,
spatial_scale=1.0,
sampling_ratio=0,
gamma=0.1):
super(DeformRoIPool, self).__init__()
self.output_size = _pair(output_size)
self.spatial_scale = float(spatial_scale)
self.sampling_ratio = int(sampling_ratio)
self.gamma = float(gamma)
def forward(self, input, rois, offset=None):
return deform_roi_pool(input, rois, offset, self.output_size,
self.spatial_scale, self.sampling_ratio,
self.gamma)
class DeformRoIPoolPack(DeformRoIPool):
def __init__(self,
output_size,
output_channels,
deform_fc_channels=1024,
spatial_scale=1.0,
sampling_ratio=0,
gamma=0.1):
super(DeformRoIPoolPack, self).__init__(output_size, spatial_scale,
sampling_ratio, gamma)
self.output_channels = output_channels
self.deform_fc_channels = deform_fc_channels
self.offset_fc = nn.Sequential(
nn.Linear(
self.output_size[0] * self.output_size[1] *
self.output_channels, self.deform_fc_channels),
nn.ReLU(inplace=True),
nn.Linear(self.deform_fc_channels, self.deform_fc_channels),
nn.ReLU(inplace=True),
nn.Linear(self.deform_fc_channels,
self.output_size[0] * self.output_size[1] * 2))
self.offset_fc[-1].weight.data.zero_()
self.offset_fc[-1].bias.data.zero_()
def forward(self, input, rois):
assert input.size(1) == self.output_channels
x = deform_roi_pool(input, rois, None, self.output_size,
self.spatial_scale, self.sampling_ratio,
self.gamma)
rois_num = rois.size(0)
offset = self.offset_fc(x.view(rois_num, -1))
offset = offset.view(rois_num, 2, self.output_size[0],
self.output_size[1])
return deform_roi_pool(input, rois, offset, self.output_size,
self.spatial_scale, self.sampling_ratio,
self.gamma)
class ModulatedDeformRoIPoolPack(DeformRoIPool):
def __init__(self,
output_size,
output_channels,
deform_fc_channels=1024,
spatial_scale=1.0,
sampling_ratio=0,
gamma=0.1):
super(ModulatedDeformRoIPoolPack,
self).__init__(output_size, spatial_scale, sampling_ratio, gamma)
self.output_channels = output_channels
self.deform_fc_channels = deform_fc_channels
self.offset_fc = nn.Sequential(
nn.Linear(
self.output_size[0] * self.output_size[1] *
self.output_channels, self.deform_fc_channels),
nn.ReLU(inplace=True),
nn.Linear(self.deform_fc_channels, self.deform_fc_channels),
nn.ReLU(inplace=True),
nn.Linear(self.deform_fc_channels,
self.output_size[0] * self.output_size[1] * 2))
self.offset_fc[-1].weight.data.zero_()
self.offset_fc[-1].bias.data.zero_()
self.mask_fc = nn.Sequential(
nn.Linear(
self.output_size[0] * self.output_size[1] *
self.output_channels, self.deform_fc_channels),
nn.ReLU(inplace=True),
nn.Linear(self.deform_fc_channels,
self.output_size[0] * self.output_size[1] * 1),
nn.Sigmoid())
self.mask_fc[2].weight.data.zero_()
self.mask_fc[2].bias.data.zero_()
def forward(self, input, rois):
assert input.size(1) == self.output_channels
x = deform_roi_pool(input, rois, None, self.output_size,
self.spatial_scale, self.sampling_ratio,
self.gamma)
rois_num = rois.size(0)
offset = self.offset_fc(x.view(rois_num, -1))
offset = offset.view(rois_num, 2, self.output_size[0],
self.output_size[1])
mask = self.mask_fc(x.view(rois_num, -1))
mask = mask.view(rois_num, 1, self.output_size[0], self.output_size[1])
d = deform_roi_pool(input, rois, offset, self.output_size,
self.spatial_scale, self.sampling_ratio,
self.gamma)
return d * mask
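For reference (not part of this diff): a usage sketch of `DeformRoIPoolPack`, which first pools without offsets, predicts offsets from that result with `offset_fc`, then pools again. Assumes the compiled `_ext` extension, a CUDA device, and the `mmcv.ops` import path:

```python
import torch
from mmcv.ops import DeformRoIPoolPack  # assumed import path

pool = DeformRoIPoolPack(output_size=7, output_channels=16,
                         spatial_scale=1.0 / 16).cuda()
feat = torch.randn(1, 16, 50, 50, device='cuda')
# rois are (batch_idx, x1, y1, x2, y2) in the input image coordinate frame
rois = torch.tensor([[0., 16., 16., 240., 240.]], device='cuda')
out = pool(feat, rois)
print(out.shape)         # torch.Size([1, 16, 7, 7])
```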
import torch
import torch.nn as nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', [
'sigmoid_focal_loss_forward', 'sigmoid_focal_loss_backward',
'softmax_focal_loss_forward', 'softmax_focal_loss_backward'
])
class SigmoidFocalLossFunction(Function):
@staticmethod
def symbolic(g, input, target, gamma, alpha, weight, reduction):
return g.op(
'MMCVSigmoidFocalLoss',
input,
target,
gamma=gamma,
alpha=alpha,
weight=weight,
reduction=reduction)
@staticmethod
def forward(ctx,
input,
target,
gamma=2.0,
alpha=0.25,
weight=None,
reduction='mean'):
assert isinstance(target, (torch.LongTensor, torch.cuda.LongTensor))
assert input.dim() == 2
assert target.dim() == 1
assert input.size(0) == target.size(0)
if weight is None:
weight = input.new_empty(0)
else:
assert weight.dim() == 1
assert input.size(1) == weight.size(0)
ctx.reduction_dict = {'none': 0, 'mean': 1, 'sum': 2}
assert reduction in ctx.reduction_dict.keys()
ctx.gamma = float(gamma)
ctx.alpha = float(alpha)
ctx.reduction = ctx.reduction_dict[reduction]
output = input.new_zeros(input.size())
ext_module.sigmoid_focal_loss_forward(
input, target, weight, output, gamma=ctx.gamma, alpha=ctx.alpha)
if ctx.reduction == ctx.reduction_dict['mean']:
output = output.sum() / input.size(0)
elif ctx.reduction == ctx.reduction_dict['sum']:
output = output.sum()
ctx.save_for_backward(input, target, weight)
return output
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
input, target, weight = ctx.saved_tensors
grad_input = input.new_zeros(input.size())
ext_module.sigmoid_focal_loss_backward(
input,
target,
weight,
grad_input,
gamma=ctx.gamma,
alpha=ctx.alpha)
grad_input *= grad_output
if ctx.reduction == ctx.reduction_dict['mean']:
grad_input /= input.size(0)
return grad_input, None, None, None, None, None
sigmoid_focal_loss = SigmoidFocalLossFunction.apply
class SigmoidFocalLoss(nn.Module):
def __init__(self, gamma, alpha, weight=None, reduction='mean'):
super(SigmoidFocalLoss, self).__init__()
self.gamma = gamma
self.alpha = alpha
self.register_buffer('weight', weight)
self.reduction = reduction
def forward(self, input, target):
return sigmoid_focal_loss(input, target, self.gamma, self.alpha,
self.weight, self.reduction)
def __repr__(self):
s = self.__class__.__name__
s += f'(gamma={self.gamma}, '
s += f'alpha={self.alpha}, '
s += f'reduction={self.reduction})'
return s
class SoftmaxFocalLossFunction(Function):
@staticmethod
def symbolic(g, input, target, gamma, alpha, weight, reduction):
return g.op(
'MMCVSoftmaxFocalLoss',
input,
target,
gamma=gamma,
alpha=alpha,
weight=weight,
reduction=reduction)
@staticmethod
def forward(ctx,
input,
target,
gamma=2.0,
alpha=0.25,
weight=None,
reduction='mean'):
assert isinstance(target, (torch.LongTensor, torch.cuda.LongTensor))
assert input.dim() == 2
assert target.dim() == 1
assert input.size(0) == target.size(0)
if weight is None:
weight = input.new_empty(0)
else:
assert weight.dim() == 1
assert input.size(1) == weight.size(0)
ctx.reduction_dict = {'none': 0, 'mean': 1, 'sum': 2}
assert reduction in ctx.reduction_dict.keys()
ctx.gamma = float(gamma)
ctx.alpha = float(alpha)
ctx.reduction = ctx.reduction_dict[reduction]
channel_stats, _ = torch.max(input, dim=1)
input_softmax = input - channel_stats.unsqueeze(1).expand_as(input)
input_softmax.exp_()
channel_stats = input_softmax.sum(dim=1)
input_softmax /= channel_stats.unsqueeze(1).expand_as(input)
output = input.new_zeros(input.size(0))
ext_module.softmax_focal_loss_forward(
input_softmax,
target,
weight,
output,
gamma=ctx.gamma,
alpha=ctx.alpha)
if ctx.reduction == ctx.reduction_dict['mean']:
output = output.sum() / input.size(0)
elif ctx.reduction == ctx.reduction_dict['sum']:
output = output.sum()
ctx.save_for_backward(input_softmax, target, weight)
return output
@staticmethod
def backward(ctx, grad_output):
input_softmax, target, weight = ctx.saved_tensors
buff = input_softmax.new_zeros(input_softmax.size(0))
grad_input = input_softmax.new_zeros(input_softmax.size())
ext_module.softmax_focal_loss_backward(
input_softmax,
target,
weight,
buff,
grad_input,
gamma=ctx.gamma,
alpha=ctx.alpha)
grad_input *= grad_output
if ctx.reduction == ctx.reduction_dict['mean']:
grad_input /= input_softmax.size(0)
return grad_input, None, None, None, None, None
softmax_focal_loss = SoftmaxFocalLossFunction.apply
class SoftmaxFocalLoss(nn.Module):
def __init__(self, gamma, alpha, weight=None, reduction='mean'):
super(SoftmaxFocalLoss, self).__init__()
self.gamma = gamma
self.alpha = alpha
self.register_buffer('weight', weight)
self.reduction = reduction
def forward(self, input, target):
return softmax_focal_loss(input, target, self.gamma, self.alpha,
self.weight, self.reduction)
def __repr__(self):
s = self.__class__.__name__
s += f'(gamma={self.gamma}, '
s += f'alpha={self.alpha}, '
s += f'reduction={self.reduction})'
return s
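For reference (not part of this diff): a usage sketch of `SigmoidFocalLoss`. The extension op expects 2D logits and 1D integer class targets on the same CUDA device, and the chosen reduction is applied afterwards. Assumes the `mmcv.ops` import path:

```python
import torch
from mmcv.ops import SigmoidFocalLoss  # assumed import path

logits = torch.randn(8, 4, device='cuda', requires_grad=True)  # (num_samples, num_classes)
labels = torch.randint(0, 4, (8, ), device='cuda')             # LongTensor class indices
criterion = SigmoidFocalLoss(gamma=2.0, alpha=0.25)
loss = criterion(logits, labels)   # scalar, since reduction defaults to 'mean'
loss.backward()
```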
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from ..cnn import kaiming_init
class GeneralizedAttention(nn.Module):
"""GeneralizedAttention module.
See 'An Empirical Study of Spatial Attention Mechanisms in Deep Networks'
    (https://arxiv.org/abs/1904.05873) for details.
Args:
in_channels (int): Channels of the input feature map.
spatial_range (int): The spatial range.
-1 indicates no spatial range constraint.
num_heads (int): The head number of empirical_attention module.
position_embedding_dim (int): The position embedding dimension.
position_magnitude (int): A multiplier acting on coord difference.
kv_stride (int): The feature stride acting on key/value feature map.
q_stride (int): The feature stride acting on query feature map.
attention_type (str): A binary indicator string for indicating which
items in generalized empirical_attention module are used.
'1000' indicates 'query and key content' (appr - appr) item,
'0100' indicates 'query content and relative position'
(appr - position) item,
'0010' indicates 'key content only' (bias - appr) item,
'0001' indicates 'relative position only' (bias - position) item.
"""
def __init__(self,
in_channels,
spatial_range=-1,
num_heads=9,
position_embedding_dim=-1,
position_magnitude=1,
kv_stride=2,
q_stride=1,
attention_type='1111'):
super(GeneralizedAttention, self).__init__()
# hard range means local range for non-local operation
self.position_embedding_dim = (
position_embedding_dim
if position_embedding_dim > 0 else in_channels)
self.position_magnitude = position_magnitude
self.num_heads = num_heads
self.in_channels = in_channels
self.spatial_range = spatial_range
self.kv_stride = kv_stride
self.q_stride = q_stride
self.attention_type = [bool(int(_)) for _ in attention_type]
self.qk_embed_dim = in_channels // num_heads
out_c = self.qk_embed_dim * num_heads
if self.attention_type[0] or self.attention_type[1]:
self.query_conv = nn.Conv2d(
in_channels=in_channels,
out_channels=out_c,
kernel_size=1,
bias=False)
self.query_conv.kaiming_init = True
if self.attention_type[0] or self.attention_type[2]:
self.key_conv = nn.Conv2d(
in_channels=in_channels,
out_channels=out_c,
kernel_size=1,
bias=False)
self.key_conv.kaiming_init = True
self.v_dim = in_channels // num_heads
self.value_conv = nn.Conv2d(
in_channels=in_channels,
out_channels=self.v_dim * num_heads,
kernel_size=1,
bias=False)
self.value_conv.kaiming_init = True
if self.attention_type[1] or self.attention_type[3]:
self.appr_geom_fc_x = nn.Linear(
self.position_embedding_dim // 2, out_c, bias=False)
self.appr_geom_fc_x.kaiming_init = True
self.appr_geom_fc_y = nn.Linear(
self.position_embedding_dim // 2, out_c, bias=False)
self.appr_geom_fc_y.kaiming_init = True
if self.attention_type[2]:
stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2)
appr_bias_value = -2 * stdv * torch.rand(out_c) + stdv
self.appr_bias = nn.Parameter(appr_bias_value)
if self.attention_type[3]:
stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2)
geom_bias_value = -2 * stdv * torch.rand(out_c) + stdv
self.geom_bias = nn.Parameter(geom_bias_value)
self.proj_conv = nn.Conv2d(
in_channels=self.v_dim * num_heads,
out_channels=in_channels,
kernel_size=1,
bias=True)
self.proj_conv.kaiming_init = True
self.gamma = nn.Parameter(torch.zeros(1))
if self.spatial_range >= 0:
            # only works when the non-local module is placed after a 3x3 conv
if in_channels == 256:
max_len = 84
elif in_channels == 512:
max_len = 42
max_len_kv = int((max_len - 1.0) / self.kv_stride + 1)
local_constraint_map = np.ones(
                (max_len, max_len, max_len_kv, max_len_kv), dtype=int)
for iy in range(max_len):
for ix in range(max_len):
local_constraint_map[
iy, ix,
max((iy - self.spatial_range) //
self.kv_stride, 0):min((iy + self.spatial_range +
1) // self.kv_stride +
1, max_len),
max((ix - self.spatial_range) //
self.kv_stride, 0):min((ix + self.spatial_range +
1) // self.kv_stride +
1, max_len)] = 0
self.local_constraint_map = nn.Parameter(
torch.from_numpy(local_constraint_map).byte(),
requires_grad=False)
if self.q_stride > 1:
self.q_downsample = nn.AvgPool2d(
kernel_size=1, stride=self.q_stride)
else:
self.q_downsample = None
if self.kv_stride > 1:
self.kv_downsample = nn.AvgPool2d(
kernel_size=1, stride=self.kv_stride)
else:
self.kv_downsample = None
self.init_weights()
def get_position_embedding(self,
h,
w,
h_kv,
w_kv,
q_stride,
kv_stride,
device,
feat_dim,
wave_length=1000):
h_idxs = torch.linspace(0, h - 1, h).cuda(device)
h_idxs = h_idxs.view((h, 1)) * q_stride
w_idxs = torch.linspace(0, w - 1, w).cuda(device)
w_idxs = w_idxs.view((w, 1)) * q_stride
h_kv_idxs = torch.linspace(0, h_kv - 1, h_kv).cuda(device)
h_kv_idxs = h_kv_idxs.view((h_kv, 1)) * kv_stride
w_kv_idxs = torch.linspace(0, w_kv - 1, w_kv).cuda(device)
w_kv_idxs = w_kv_idxs.view((w_kv, 1)) * kv_stride
# (h, h_kv, 1)
h_diff = h_idxs.unsqueeze(1) - h_kv_idxs.unsqueeze(0)
h_diff *= self.position_magnitude
# (w, w_kv, 1)
w_diff = w_idxs.unsqueeze(1) - w_kv_idxs.unsqueeze(0)
w_diff *= self.position_magnitude
feat_range = torch.arange(0, feat_dim / 4).cuda(device)
dim_mat = torch.Tensor([wave_length]).cuda(device)
dim_mat = dim_mat**((4. / feat_dim) * feat_range)
dim_mat = dim_mat.view((1, 1, -1))
embedding_x = torch.cat(
((w_diff / dim_mat).sin(), (w_diff / dim_mat).cos()), dim=2)
embedding_y = torch.cat(
((h_diff / dim_mat).sin(), (h_diff / dim_mat).cos()), dim=2)
return embedding_x, embedding_y
def forward(self, x_input):
num_heads = self.num_heads
# use empirical_attention
if self.q_downsample is not None:
x_q = self.q_downsample(x_input)
else:
x_q = x_input
n, _, h, w = x_q.shape
if self.kv_downsample is not None:
x_kv = self.kv_downsample(x_input)
else:
x_kv = x_input
_, _, h_kv, w_kv = x_kv.shape
if self.attention_type[0] or self.attention_type[1]:
proj_query = self.query_conv(x_q).view(
(n, num_heads, self.qk_embed_dim, h * w))
proj_query = proj_query.permute(0, 1, 3, 2)
if self.attention_type[0] or self.attention_type[2]:
proj_key = self.key_conv(x_kv).view(
(n, num_heads, self.qk_embed_dim, h_kv * w_kv))
if self.attention_type[1] or self.attention_type[3]:
position_embed_x, position_embed_y = self.get_position_embedding(
h, w, h_kv, w_kv, self.q_stride, self.kv_stride,
x_input.device, self.position_embedding_dim)
# (n, num_heads, w, w_kv, dim)
position_feat_x = self.appr_geom_fc_x(position_embed_x).\
view(1, w, w_kv, num_heads, self.qk_embed_dim).\
permute(0, 3, 1, 2, 4).\
repeat(n, 1, 1, 1, 1)
# (n, num_heads, h, h_kv, dim)
position_feat_y = self.appr_geom_fc_y(position_embed_y).\
view(1, h, h_kv, num_heads, self.qk_embed_dim).\
permute(0, 3, 1, 2, 4).\
repeat(n, 1, 1, 1, 1)
position_feat_x /= math.sqrt(2)
position_feat_y /= math.sqrt(2)
# accelerate for saliency only
if (np.sum(self.attention_type) == 1) and self.attention_type[2]:
appr_bias = self.appr_bias.\
view(1, num_heads, 1, self.qk_embed_dim).\
repeat(n, 1, 1, 1)
energy = torch.matmul(appr_bias, proj_key).\
view(n, num_heads, 1, h_kv * w_kv)
h = 1
w = 1
else:
# (n, num_heads, h*w, h_kv*w_kv), query before key, 540mb for
if not self.attention_type[0]:
energy = torch.zeros(
n,
num_heads,
h,
w,
h_kv,
w_kv,
dtype=x_input.dtype,
device=x_input.device)
# attention_type[0]: appr - appr
# attention_type[1]: appr - position
# attention_type[2]: bias - appr
# attention_type[3]: bias - position
if self.attention_type[0] or self.attention_type[2]:
if self.attention_type[0] and self.attention_type[2]:
appr_bias = self.appr_bias.\
view(1, num_heads, 1, self.qk_embed_dim)
energy = torch.matmul(proj_query + appr_bias, proj_key).\
view(n, num_heads, h, w, h_kv, w_kv)
elif self.attention_type[0]:
energy = torch.matmul(proj_query, proj_key).\
view(n, num_heads, h, w, h_kv, w_kv)
elif self.attention_type[2]:
appr_bias = self.appr_bias.\
view(1, num_heads, 1, self.qk_embed_dim).\
repeat(n, 1, 1, 1)
energy += torch.matmul(appr_bias, proj_key).\
view(n, num_heads, 1, 1, h_kv, w_kv)
if self.attention_type[1] or self.attention_type[3]:
if self.attention_type[1] and self.attention_type[3]:
geom_bias = self.geom_bias.\
view(1, num_heads, 1, self.qk_embed_dim)
proj_query_reshape = (proj_query + geom_bias).\
view(n, num_heads, h, w, self.qk_embed_dim)
energy_x = torch.matmul(
proj_query_reshape.permute(0, 1, 3, 2, 4),
position_feat_x.permute(0, 1, 2, 4, 3))
energy_x = energy_x.\
permute(0, 1, 3, 2, 4).unsqueeze(4)
energy_y = torch.matmul(
proj_query_reshape,
position_feat_y.permute(0, 1, 2, 4, 3))
energy_y = energy_y.unsqueeze(5)
energy += energy_x + energy_y
elif self.attention_type[1]:
proj_query_reshape = proj_query.\
view(n, num_heads, h, w, self.qk_embed_dim)
proj_query_reshape = proj_query_reshape.\
permute(0, 1, 3, 2, 4)
position_feat_x_reshape = position_feat_x.\
permute(0, 1, 2, 4, 3)
position_feat_y_reshape = position_feat_y.\
permute(0, 1, 2, 4, 3)
energy_x = torch.matmul(proj_query_reshape,
position_feat_x_reshape)
energy_x = energy_x.permute(0, 1, 3, 2, 4).unsqueeze(4)
energy_y = torch.matmul(proj_query_reshape,
position_feat_y_reshape)
energy_y = energy_y.unsqueeze(5)
energy += energy_x + energy_y
elif self.attention_type[3]:
geom_bias = self.geom_bias.\
view(1, num_heads, self.qk_embed_dim, 1).\
repeat(n, 1, 1, 1)
position_feat_x_reshape = position_feat_x.\
view(n, num_heads, w*w_kv, self.qk_embed_dim)
position_feat_y_reshape = position_feat_y.\
view(n, num_heads, h * h_kv, self.qk_embed_dim)
energy_x = torch.matmul(position_feat_x_reshape, geom_bias)
energy_x = energy_x.view(n, num_heads, 1, w, 1, w_kv)
energy_y = torch.matmul(position_feat_y_reshape, geom_bias)
energy_y = energy_y.view(n, num_heads, h, 1, h_kv, 1)
energy += energy_x + energy_y
energy = energy.view(n, num_heads, h * w, h_kv * w_kv)
if self.spatial_range >= 0:
cur_local_constraint_map = \
self.local_constraint_map[:h, :w, :h_kv, :w_kv].\
contiguous().\
view(1, 1, h*w, h_kv*w_kv)
energy = energy.masked_fill_(cur_local_constraint_map,
float('-inf'))
attention = F.softmax(energy, 3)
proj_value = self.value_conv(x_kv)
proj_value_reshape = proj_value.\
view((n, num_heads, self.v_dim, h_kv * w_kv)).\
permute(0, 1, 3, 2)
out = torch.matmul(attention, proj_value_reshape).\
permute(0, 1, 3, 2).\
contiguous().\
view(n, self.v_dim * self.num_heads, h, w)
out = self.proj_conv(out)
out = self.gamma * out + x_input
return out
def init_weights(self):
for m in self.modules():
if hasattr(m, 'kaiming_init') and m.kaiming_init:
kaiming_init(
m,
mode='fan_in',
nonlinearity='leaky_relu',
bias=0,
distribution='uniform',
a=1)
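For reference (not part of this diff): a usage sketch of `GeneralizedAttention`. `num_heads` must divide `in_channels`, and the module adds a gamma-gated residual so the output keeps the input shape. Assumes a CUDA device (the position embedding is built with `.cuda()`) and the `mmcv.ops` import path:

```python
import torch
from mmcv.ops import GeneralizedAttention  # assumed import path

# '1111' enables all four terms: appr-appr, appr-position, bias-appr, bias-position
attn = GeneralizedAttention(in_channels=16, num_heads=8, kv_stride=2,
                            attention_type='1111').cuda()
x = torch.randn(2, 16, 20, 20, device='cuda')
out = attn(x)            # gated residual output, same shape as the input
print(out.shape)         # torch.Size([2, 16, 20, 20])
```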
import torch
if torch.__version__ == 'parrots':
import parrots
def get_compiler_version():
return 'GCC ' + parrots.version.compiler
def get_compiling_cuda_version():
return parrots.version.cuda
else:
from ..utils import ext_loader
ext_module = ext_loader.load_ext(
'_ext', ['get_compiler_version', 'get_compiling_cuda_version'])
def get_compiler_version():
return ext_module.get_compiler_version()
def get_compiling_cuda_version():
return ext_module.get_compiling_cuda_version()
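For reference (not part of this diff): these helpers report how the extension was built, whichever backend (PyTorch or parrots) is in use. A minimal usage sketch, assuming the `mmcv.ops` import path:

```python
from mmcv.ops import get_compiler_version, get_compiling_cuda_version  # assumed import path

print(get_compiler_version())        # e.g. 'GCC 7.3'
print(get_compiling_cuda_version())  # e.g. '10.1'
```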
import math
import torch
import torch.nn as nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair
from ..utils import ext_loader
ext_module = ext_loader.load_ext(
'_ext', ['masked_im2col_forward', 'masked_col2im_forward'])
class MaskedConv2dFunction(Function):
@staticmethod
def symbolic(g, features, mask, weight, bias, padding, stride):
return g.op(
'MMCVMaskedConv2d',
features,
mask,
weight,
bias,
padding=padding,
stride=stride)
@staticmethod
def forward(ctx, features, mask, weight, bias, padding=0, stride=1):
assert mask.dim() == 3 and mask.size(0) == 1
assert features.dim() == 4 and features.size(0) == 1
assert features.size()[2:] == mask.size()[1:]
pad_h, pad_w = _pair(padding)
stride_h, stride_w = _pair(stride)
if stride_h != 1 or stride_w != 1:
raise ValueError(
                'Only stride 1 is supported in masked_conv2d currently.')
out_channel, in_channel, kernel_h, kernel_w = weight.size()
batch_size = features.size(0)
out_h = int(
math.floor((features.size(2) + 2 * pad_h -
(kernel_h - 1) - 1) / stride_h + 1))
out_w = int(
math.floor((features.size(3) + 2 * pad_w -
                        (kernel_w - 1) - 1) / stride_w + 1))
mask_inds = torch.nonzero(mask[0] > 0, as_tuple=False)
output = features.new_zeros(batch_size, out_channel, out_h, out_w)
if mask_inds.numel() > 0:
mask_h_idx = mask_inds[:, 0].contiguous()
mask_w_idx = mask_inds[:, 1].contiguous()
data_col = features.new_zeros(in_channel * kernel_h * kernel_w,
mask_inds.size(0))
ext_module.masked_im2col_forward(
features,
mask_h_idx,
mask_w_idx,
data_col,
kernel_h=kernel_h,
kernel_w=kernel_w,
pad_h=pad_h,
pad_w=pad_w)
            masked_output = torch.addmm(bias[:, None],
                                        weight.view(out_channel, -1), data_col)
ext_module.masked_col2im_forward(
masked_output,
mask_h_idx,
mask_w_idx,
output,
height=out_h,
width=out_w,
channels=out_channel)
return output
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
return (None, ) * 5
masked_conv2d = MaskedConv2dFunction.apply
class MaskedConv2d(nn.Conv2d):
"""A MaskedConv2d which inherits the official Conv2d.
    The masked forward does not implement the backward function and
    currently only supports a stride of 1.
"""
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True):
super(MaskedConv2d,
self).__init__(in_channels, out_channels, kernel_size, stride,
padding, dilation, groups, bias)
def forward(self, input, mask=None):
if mask is None: # fallback to the normal Conv2d
return super(MaskedConv2d, self).forward(input)
else:
return masked_conv2d(input, mask, self.weight, self.bias,
self.padding)
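For reference (not part of this diff): a usage sketch of `MaskedConv2d`, which evaluates the convolution only at positions where the mask is positive and falls back to the ordinary `nn.Conv2d` forward when no mask is given. Assumes the compiled `_ext` extension, a CUDA device, and the `mmcv.ops` import path:

```python
import torch
from mmcv.ops import MaskedConv2d  # assumed import path

conv = MaskedConv2d(16, 32, kernel_size=3, padding=1).cuda()
x = torch.randn(1, 16, 40, 40, device='cuda')            # batch size must be 1 in masked mode
mask = (torch.rand(1, 40, 40, device='cuda') > 0.9).float()
out_masked = conv(x, mask)   # computed only at masked positions, zero elsewhere
out_full = conv(x)           # mask=None: ordinary Conv2d forward
```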
from abc import abstractmethod
import torch
import torch.nn as nn
import torch.nn.functional as F
from ..cnn import ConvModule
class BaseMergeCell(nn.Module):
"""The basic class for cells used in NAS-FPN and NAS-FCOS.
    BaseMergeCell takes 2 inputs. After applying convolution
    on them, they are resized to the target size. Then,
    they go through binary_op, which depends on the type of cell.
    If with_out_conv is True, the output will go through
    another convolution layer.
Args:
        fused_channels (int): number of input channels of the out_conv layer.
        out_channels (int): number of output channels of the out_conv layer.
        with_out_conv (bool): Whether to use the out_conv layer.
out_conv_cfg (dict): Config dict for convolution layer, which should
contain "groups", "kernel_size", "padding", "bias" to build
out_conv layer.
out_norm_cfg (dict): Config dict for normalization layer in out_conv.
out_conv_order (tuple): The order of conv/norm/activation layers in
out_conv.
with_input1_conv (bool): Whether to use convolution on input1.
with_input2_conv (bool): Whether to use convolution on input2.
input_conv_cfg (dict): Config dict for building input1_conv layer and
input2_conv layer, which is expected to contain the type of
convolution.
Default: None, which means using conv2d.
input_norm_cfg (dict): Config dict for normalization layer in
input1_conv and input2_conv layer. Default: None.
upsample_mode (str): Interpolation method used to resize the output
of input1_conv and input2_conv to target size. Currently, we
support ['nearest', 'bilinear']. Default: 'nearest'.
"""
def __init__(self,
fused_channels=256,
out_channels=256,
with_out_conv=True,
out_conv_cfg=dict(
groups=1, kernel_size=3, padding=1, bias=True),
out_norm_cfg=None,
out_conv_order=('act', 'conv', 'norm'),
with_input1_conv=False,
with_input2_conv=False,
input_conv_cfg=None,
input_norm_cfg=None,
upsample_mode='nearest'):
super(BaseMergeCell, self).__init__()
assert upsample_mode in ['nearest', 'bilinear']
self.with_out_conv = with_out_conv
self.with_input1_conv = with_input1_conv
self.with_input2_conv = with_input2_conv
self.upsample_mode = upsample_mode
if self.with_out_conv:
self.out_conv = ConvModule(
fused_channels,
out_channels,
**out_conv_cfg,
norm_cfg=out_norm_cfg,
order=out_conv_order)
self.input1_conv = self._build_input_conv(
out_channels, input_conv_cfg,
input_norm_cfg) if with_input1_conv else nn.Sequential()
self.input2_conv = self._build_input_conv(
out_channels, input_conv_cfg,
input_norm_cfg) if with_input2_conv else nn.Sequential()
def _build_input_conv(self, channel, conv_cfg, norm_cfg):
return ConvModule(
channel,
channel,
3,
padding=1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
bias=True)
@abstractmethod
def _binary_op(self, x1, x2):
pass
def _resize(self, x, size):
if x.shape[-2:] == size:
return x
elif x.shape[-2:] < size:
return F.interpolate(x, size=size, mode=self.upsample_mode)
else:
assert x.shape[-2] % size[-2] == 0 and x.shape[-1] % size[-1] == 0
kernel_size = x.shape[-1] // size[-1]
x = F.max_pool2d(x, kernel_size=kernel_size, stride=kernel_size)
return x
def forward(self, x1, x2, out_size=None):
assert x1.shape[:2] == x2.shape[:2]
assert out_size is None or len(out_size) == 2
if out_size is None: # resize to larger one
out_size = max(x1.size()[2:], x2.size()[2:])
x1 = self.input1_conv(x1)
x2 = self.input2_conv(x2)
x1 = self._resize(x1, out_size)
x2 = self._resize(x2, out_size)
x = self._binary_op(x1, x2)
if self.with_out_conv:
x = self.out_conv(x)
return x
class SumCell(BaseMergeCell):
def __init__(self, in_channels, out_channels, **kwargs):
super(SumCell, self).__init__(in_channels, out_channels, **kwargs)
def _binary_op(self, x1, x2):
return x1 + x2
class ConcatCell(BaseMergeCell):
def __init__(self, in_channels, out_channels, **kwargs):
super(ConcatCell, self).__init__(in_channels * 2, out_channels,
**kwargs)
def _binary_op(self, x1, x2):
ret = torch.cat([x1, x2], dim=1)
return ret
class GlobalPoolingCell(BaseMergeCell):
def __init__(self, in_channels=None, out_channels=None, **kwargs):
super().__init__(in_channels, out_channels, **kwargs)
self.global_pool = nn.AdaptiveAvgPool2d((1, 1))
def _binary_op(self, x1, x2):
x2_att = self.global_pool(x2).sigmoid()
return x2 + x2_att * x1
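For reference (not part of this diff): a usage sketch of the merge cells. Both inputs are resized to the larger spatial size before the binary op; `SumCell` adds them, while `GlobalPoolingCell` gates one input with the other's global-pooled sigmoid attention. Runs on CPU and assumes the classes are importable from `mmcv.ops`:

```python
import torch
from mmcv.ops import GlobalPoolingCell, SumCell  # assumed import path

cell = SumCell(in_channels=16, out_channels=16)
x1 = torch.randn(2, 16, 32, 32)
x2 = torch.randn(2, 16, 16, 16)
out = cell(x1, x2)           # x2 is upsampled to 32x32, summed with x1, then conv'ed
print(out.shape)             # torch.Size([2, 16, 32, 32])

gp = GlobalPoolingCell(with_out_conv=False)
out2 = gp(x1, x1)            # global-pooling sigmoid attention gates the first input
```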
import math
import torch
import torch.nn as nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair, _single
from ..cnn import CONV_LAYERS
from ..utils import ext_loader, print_log
ext_module = ext_loader.load_ext(
'_ext',
['modulated_deform_conv_forward', 'modulated_deform_conv_backward'])
class ModulatedDeformConv2dFunction(Function):
@staticmethod
def symbolic(g, input, offset, mask, weight, bias, stride, padding,
dilation, groups, deform_groups):
return g.op(
'MMCVModulatedDeformConv2d',
input,
offset,
mask,
weight,
bias,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
deform_groups=deform_groups)
@staticmethod
def forward(ctx,
input,
offset,
mask,
weight,
bias=None,
stride=1,
padding=0,
dilation=1,
groups=1,
deform_groups=1):
if input is not None and input.dim() != 4:
raise ValueError(
                f'Expected 4D tensor as input, got {input.dim()}D tensor '
                'instead.')
ctx.stride = _pair(stride)
ctx.padding = _pair(padding)
ctx.dilation = _pair(dilation)
ctx.groups = groups
ctx.deform_groups = deform_groups
ctx.with_bias = bias is not None
if not ctx.with_bias:
bias = input.new_empty(0) # fake tensor
if weight.requires_grad or mask.requires_grad or offset.requires_grad \
or input.requires_grad:
ctx.save_for_backward(input, offset, mask, weight, bias)
output = input.new_empty(
ModulatedDeformConv2dFunction._output_size(ctx, input, weight))
ctx._bufs = [input.new_empty(0), input.new_empty(0)]
ext_module.modulated_deform_conv_forward(
input,
weight,
bias,
ctx._bufs[0],
offset,
mask,
output,
ctx._bufs[1],
kernel_h=weight.size(2),
kernel_w=weight.size(3),
stride_h=ctx.stride[0],
stride_w=ctx.stride[1],
pad_h=ctx.padding[0],
pad_w=ctx.padding[1],
dilation_h=ctx.dilation[0],
dilation_w=ctx.dilation[1],
group=ctx.groups,
deformable_group=ctx.deform_groups,
with_bias=ctx.with_bias)
return output
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
input, offset, mask, weight, bias = ctx.saved_tensors
grad_input = torch.zeros_like(input)
grad_offset = torch.zeros_like(offset)
grad_mask = torch.zeros_like(mask)
grad_weight = torch.zeros_like(weight)
grad_bias = torch.zeros_like(bias)
grad_output = grad_output.contiguous()
ext_module.modulated_deform_conv_backward(
input,
weight,
bias,
ctx._bufs[0],
offset,
mask,
ctx._bufs[1],
grad_input,
grad_weight,
grad_bias,
grad_offset,
grad_mask,
grad_output,
kernel_h=weight.size(2),
kernel_w=weight.size(3),
stride_h=ctx.stride[0],
stride_w=ctx.stride[1],
pad_h=ctx.padding[0],
pad_w=ctx.padding[1],
dilation_h=ctx.dilation[0],
dilation_w=ctx.dilation[1],
group=ctx.groups,
deformable_group=ctx.deform_groups,
with_bias=ctx.with_bias)
if not ctx.with_bias:
grad_bias = None
return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias,
None, None, None, None, None)
@staticmethod
def _output_size(ctx, input, weight):
channels = weight.size(0)
output_size = (input.size(0), channels)
for d in range(input.dim() - 2):
in_size = input.size(d + 2)
pad = ctx.padding[d]
kernel = ctx.dilation[d] * (weight.size(d + 2) - 1) + 1
stride_ = ctx.stride[d]
output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, )
if not all(map(lambda s: s > 0, output_size)):
raise ValueError(
'convolution input is too small (output would be ' +
'x'.join(map(str, output_size)) + ')')
return output_size
modulated_deform_conv2d = ModulatedDeformConv2dFunction.apply
class ModulatedDeformConv2d(nn.Module):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
deform_groups=1,
bias=True):
super(ModulatedDeformConv2d, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = _pair(kernel_size)
self.stride = _pair(stride)
self.padding = _pair(padding)
self.dilation = _pair(dilation)
self.groups = groups
self.deform_groups = deform_groups
# enable compatibility with nn.Conv2d
self.transposed = False
self.output_padding = _single(0)
self.weight = nn.Parameter(
torch.Tensor(out_channels, in_channels // groups,
*self.kernel_size))
if bias:
self.bias = nn.Parameter(torch.Tensor(out_channels))
else:
self.register_parameter('bias', None)
self.init_weights()
def init_weights(self):
n = self.in_channels
for k in self.kernel_size:
n *= k
stdv = 1. / math.sqrt(n)
self.weight.data.uniform_(-stdv, stdv)
if self.bias is not None:
self.bias.data.zero_()
def forward(self, x, offset, mask):
return modulated_deform_conv2d(x, offset, mask, self.weight, self.bias,
self.stride, self.padding,
self.dilation, self.groups,
self.deform_groups)
@CONV_LAYERS.register_module('DCNv2')
class ModulatedDeformConv2dPack(ModulatedDeformConv2d):
"""A ModulatedDeformable Conv Encapsulation that acts as normal Conv
layers.
Args:
in_channels (int): Same as nn.Conv2d.
out_channels (int): Same as nn.Conv2d.
kernel_size (int or tuple[int]): Same as nn.Conv2d.
stride (int): Same as nn.Conv2d, while tuple is not supported.
padding (int): Same as nn.Conv2d, while tuple is not supported.
dilation (int): Same as nn.Conv2d, while tuple is not supported.
groups (int): Same as nn.Conv2d.
bias (bool or str): If specified as `auto`, it will be decided by the
norm_cfg. Bias will be set as True if norm_cfg is None, otherwise
False.
"""
_version = 2
def __init__(self, *args, **kwargs):
super(ModulatedDeformConv2dPack, self).__init__(*args, **kwargs)
self.conv_offset = nn.Conv2d(
self.in_channels,
self.deform_groups * 3 * self.kernel_size[0] * self.kernel_size[1],
kernel_size=self.kernel_size,
stride=self.stride,
padding=self.padding,
dilation=self.dilation,
bias=True)
self.init_weights()
def init_weights(self):
super(ModulatedDeformConv2dPack, self).init_weights()
if hasattr(self, 'conv_offset'):
self.conv_offset.weight.data.zero_()
self.conv_offset.bias.data.zero_()
def forward(self, x):
out = self.conv_offset(x)
o1, o2, mask = torch.chunk(out, 3, dim=1)
offset = torch.cat((o1, o2), dim=1)
mask = torch.sigmoid(mask)
return modulated_deform_conv2d(x, offset, mask, self.weight, self.bias,
self.stride, self.padding,
self.dilation, self.groups,
self.deform_groups)
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
missing_keys, unexpected_keys, error_msgs):
version = local_metadata.get('version', None)
if version is None or version < 2:
# the key is different in early versions
# In version < 2, ModulatedDeformConvPack
# loads previous benchmark models.
if (prefix + 'conv_offset.weight' not in state_dict
and prefix[:-1] + '_offset.weight' in state_dict):
state_dict[prefix + 'conv_offset.weight'] = state_dict.pop(
prefix[:-1] + '_offset.weight')
if (prefix + 'conv_offset.bias' not in state_dict
and prefix[:-1] + '_offset.bias' in state_dict):
state_dict[prefix +
'conv_offset.bias'] = state_dict.pop(prefix[:-1] +
'_offset.bias')
if version is not None and version > 1:
print_log(
f'ModulatedDeformConvPack {prefix.rstrip(".")} is upgraded to '
'version 2.',
logger='root')
super()._load_from_state_dict(state_dict, prefix, local_metadata,
strict, missing_keys, unexpected_keys,
error_msgs)
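For reference (not part of this diff): a usage sketch of `ModulatedDeformConv2dPack` (registered as 'DCNv2'), whose `conv_offset` layer predicts both the offsets and a sigmoid-gated modulation mask. Assumes the compiled `_ext` extension, a CUDA device, and the `mmcv.ops` import path:

```python
import torch
from mmcv.ops import ModulatedDeformConv2dPack  # assumed import path

conv = ModulatedDeformConv2dPack(16, 32, kernel_size=3, padding=1,
                                 deform_groups=1).cuda()
x = torch.randn(2, 16, 28, 28, device='cuda')
out = conv(x)            # offsets and modulation mask are predicted internally
print(out.shape)         # torch.Size([2, 32, 28, 28])
```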
import numpy as np
import torch
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['nms', 'softnms', 'nms_match'])
def nms(boxes, scores, iou_threshold, offset=0):
"""Dispatch to either CPU or GPU NMS implementations.
    The input can be either a torch tensor or a numpy array. GPU NMS will be
    used if the input is a GPU tensor; otherwise CPU NMS will be used.
    The returned type will always be the same as the inputs.
Arguments:
boxes (torch.Tensor or np.ndarray): boxes in shape (N, 4).
scores (torch.Tensor or np.ndarray): scores in shape (N, ).
iou_threshold (float): IoU threshold for NMS.
offset (int, 0 or 1): boxes' width or height is (x2 - x1 + offset).
Returns:
        tuple: kept dets (boxes and scores) and indices, which always have the \
            same data type as the input.
Example:
>>> boxes = np.array([[49.1, 32.4, 51.0, 35.9],
>>> [49.3, 32.9, 51.0, 35.3],
>>> [49.2, 31.8, 51.0, 35.4],
>>> [35.1, 11.5, 39.1, 15.7],
>>> [35.6, 11.8, 39.3, 14.2],
>>> [35.3, 11.5, 39.9, 14.5],
>>> [35.2, 11.7, 39.7, 15.7]], dtype=np.float32)
>>> scores = np.array([0.9, 0.9, 0.5, 0.5, 0.5, 0.4, 0.3],\
dtype=np.float32)
>>> iou_threshold = 0.6
>>> dets, inds = nms(boxes, scores, iou_threshold)
>>> assert len(inds) == len(dets) == 3
"""
assert isinstance(boxes, (torch.Tensor, np.ndarray))
assert isinstance(scores, (torch.Tensor, np.ndarray))
is_numpy = False
if isinstance(boxes, np.ndarray):
is_numpy = True
boxes = torch.from_numpy(boxes)
if isinstance(scores, np.ndarray):
scores = torch.from_numpy(scores)
assert boxes.size(1) == 4
assert boxes.size(0) == scores.size(0)
assert offset in (0, 1)
if torch.__version__ == 'parrots':
x1 = boxes[:, 0]
y1 = boxes[:, 1]
x2 = boxes[:, 2]
y2 = boxes[:, 3]
areas = (x2 - x1 + offset) * (y2 - y1 + offset)
_, order = scores.sort(0, descending=True)
if boxes.device == 'cpu':
indata_list = [boxes, order, areas]
indata_dict = {
'iou_threshold': float(iou_threshold),
'offset': int(offset)
}
select = ext_module.nms(*indata_list, **indata_dict).byte()
else:
boxes_sorted = boxes.index_select(0, order)
indata_list = [boxes_sorted, order, areas]
indata_dict = {
'iou_threshold': float(iou_threshold),
'offset': int(offset)
}
select = ext_module.nms(*indata_list, **indata_dict)
inds = order.masked_select(select)
else:
inds = ext_module.nms(
boxes,
scores,
iou_threshold=float(iou_threshold),
offset=int(offset))
dets = torch.cat((boxes[inds], scores[inds].reshape(-1, 1)), dim=1)
if is_numpy:
dets = dets.cpu().numpy()
inds = inds.cpu().numpy()
return dets, inds
def soft_nms(boxes,
scores,
iou_threshold=0.3,
sigma=0.5,
min_score=1e-3,
method='linear',
offset=0):
"""Dispatch to only CPU Soft NMS implementations.
The input can be either a torch tensor or numpy array.
The returned type will always be the same as inputs.
Arguments:
boxes (torch.Tensor or np.ndarray): boxes in shape (N, 4).
scores (torch.Tensor or np.ndarray): scores in shape (N, ).
iou_threshold (float): IoU threshold for NMS.
        sigma (float): hyperparameter for the gaussian method
        min_score (float): score filter threshold
        method (str): one of 'naive', 'linear' or 'gaussian'
offset (int, 0 or 1): boxes' width or height is (x2 - x1 + offset).
Returns:
        tuple: kept dets (boxes and scores) and indices, which always have the \
            same data type as the input.
Example:
>>> boxes = np.array([[4., 3., 5., 3.],
>>> [4., 3., 5., 4.],
>>> [3., 1., 3., 1.],
>>> [3., 1., 3., 1.],
>>> [3., 1., 3., 1.],
>>> [3., 1., 3., 1.]], dtype=np.float32)
>>> scores = np.array([0.9, 0.9, 0.5, 0.5, 0.4, 0.0], dtype=np.float32)
>>> iou_threshold = 0.6
>>> dets, inds = soft_nms(boxes, scores, iou_threshold, sigma=0.5)
>>> assert len(inds) == len(dets) == 5
"""
assert isinstance(boxes, (torch.Tensor, np.ndarray))
assert isinstance(scores, (torch.Tensor, np.ndarray))
is_numpy = False
if isinstance(boxes, np.ndarray):
is_numpy = True
boxes = torch.from_numpy(boxes)
if isinstance(scores, np.ndarray):
scores = torch.from_numpy(scores)
assert boxes.size(1) == 4
assert boxes.size(0) == scores.size(0)
assert offset in (0, 1)
method_dict = {'naive': 0, 'linear': 1, 'gaussian': 2}
assert method in method_dict.keys()
if torch.__version__ == 'parrots':
x1 = boxes[:, 0]
y1 = boxes[:, 1]
x2 = boxes[:, 2]
y2 = boxes[:, 3]
areas = (x2 - x1 + offset) * (y2 - y1 + offset)
indata_list = [boxes.cpu(), scores.cpu(), areas.cpu()]
indata_dict = {
'iou_threshold': float(iou_threshold),
'sigma': float(sigma),
'min_score': min_score,
'method': method_dict[method],
'offset': int(offset)
}
dets, inds, num_out = ext_module.softnms(*indata_list, **indata_dict)
inds = inds[:num_out]
else:
dets = boxes.new_empty((boxes.size(0), 5), device='cpu')
inds = ext_module.softnms(
boxes.cpu(),
scores.cpu(),
dets.cpu(),
iou_threshold=float(iou_threshold),
sigma=float(sigma),
min_score=float(min_score),
method=method_dict[method],
offset=int(offset))
dets = dets[:inds.size(0)]
if is_numpy:
dets = dets.cpu().numpy()
inds = inds.cpu().numpy()
return dets, inds
else:
return dets.to(device=boxes.device), inds.to(device=boxes.device)
def batched_nms(boxes, scores, idxs, nms_cfg, class_agnostic=False):
"""Performs non-maximum suppression in a batched fashion.
Modified from https://github.com/pytorch/vision/blob
/505cd6957711af790211896d32b40291bea1bc21/torchvision/ops/boxes.py#L39.
In order to perform NMS independently per class, we add an offset to all
the boxes. The offset is dependent only on the class idx, and is large
enough so that boxes from different classes do not overlap.
Arguments:
boxes (torch.Tensor): boxes in shape (N, 4).
scores (torch.Tensor): scores in shape (N, ).
        idxs (torch.Tensor): each index value corresponds to a bbox cluster,
            and NMS will not be applied between elements of different idxs,
            shape (N, ).
nms_cfg (dict): specify nms type and other parameters like iou_thr.
class_agnostic (bool): if true, nms is class agnostic,
i.e. IoU thresholding happens over all boxes,
regardless of the predicted class
Returns:
        tuple: kept dets and indices.
"""
nms_cfg_ = nms_cfg.copy()
class_agnostic = nms_cfg_.pop('class_agnostic', class_agnostic)
if class_agnostic:
boxes_for_nms = boxes
else:
max_coordinate = boxes.max()
offsets = idxs.to(boxes) * (max_coordinate + 1)
boxes_for_nms = boxes + offsets[:, None]
nms_type = nms_cfg_.pop('type', 'nms')
nms_op = eval(nms_type)
dets, keep = nms_op(boxes_for_nms, scores, **nms_cfg_)
boxes = boxes[keep]
scores = dets[:, -1]
return torch.cat([boxes, scores[:, None]], -1), keep
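# A minimal illustrative usage of batched_nms (editor's sketch, not part of
# the original file). It assumes the compiled `_ext` module is available and
# uses random boxes, scores and class indices purely for illustration; the
# per-class offset trick described in the docstring keeps boxes of different
# classes from suppressing each other.
# >>> boxes = torch.rand(100, 4) * 100
# >>> boxes[:, 2:] += boxes[:, :2]  # ensure x2 >= x1 and y2 >= y1
# >>> scores = torch.rand(100)
# >>> idxs = torch.randint(0, 3, (100,))  # class index of each box
# >>> nms_cfg = dict(type='nms', iou_threshold=0.5)
# >>> dets, keep = batched_nms(boxes, scores, idxs, nms_cfg)
# >>> assert dets.size(1) == 5 and keep.dim() == 1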
def nms_match(dets, iou_threshold):
"""Matched dets into different groups by NMS.
NMS match is Similar to NMS but when a bbox is suppressed, nms match will
record the indice of supporessed bbox and form a group with the indice of
kept bbox. In each group, indice is sorted as score order.
Arguments:
dets (torch.Tensor | np.ndarray): Det boxes with scores, shape (N, 5).
        iou_threshold (float): IoU threshold for NMS.
Returns:
        List[Tensor | ndarray]: Each element of the outer list corresponds to
            a matched group, and the inner Tensor/ndarray contains the
            indices of that group in score order.
"""
if dets.shape[0] == 0:
matched = []
else:
        assert dets.shape[-1] == 5, 'input dets.shape should be (N, 5), ' \
            f'but got {dets.shape}'
if isinstance(dets, torch.Tensor):
dets_t = dets.detach().cpu()
else:
dets_t = torch.from_numpy(dets)
matched = ext_module.nms_match(dets_t, float(iou_threshold))
if isinstance(dets, torch.Tensor):
return [dets.new_tensor(m, dtype=torch.long) for m in matched]
else:
return [np.array(m, dtype=np.int) for m in matched]
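# A small illustrative call to nms_match (editor's sketch; the toy detections
# below are assumptions for illustration and require the compiled `_ext`
# module). Each returned group starts with the index of a kept box followed
# by the indices of the boxes it suppressed, sorted by score.
# >>> dets = np.array([[49.1, 32.4, 51.0, 35.9, 0.9],
# >>>                  [49.3, 32.9, 51.0, 35.3, 0.8],
# >>>                  [35.1, 11.5, 39.1, 15.7, 0.5]], dtype=np.float32)
# >>> groups = nms_match(dets, 0.6)
# >>> assert sum(len(g) for g in groups) == len(dets)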
from ..cnn import ConvModule, NonLocal2d
from .context_block import ContextBlock
from .generalized_attention import GeneralizedAttention
plugin_cfg = {
# format: layer_type: (abbreviation, module)
'ContextBlock': ('context_block', ContextBlock),
'GeneralizedAttention': ('gen_attention_block', GeneralizedAttention),
'NonLocal2d': ('nonlocal_block', NonLocal2d),
'ConvModule': ('conv_block', ConvModule),
}
def build_plugin_layer(cfg, postfix='', **kwargs):
"""Build plugin layer.
Args:
        cfg (dict): cfg should contain:
            type (str): identify plugin layer type.
            layer args: args needed to instantiate a plugin layer.
        postfix (int, str): appended into the plugin abbreviation to
            create a named layer.
Returns:
name (str): abbreviation + postfix
layer (nn.Module): created plugin layer
"""
assert isinstance(cfg, dict) and 'type' in cfg
cfg_ = cfg.copy()
layer_type = cfg_.pop('type')
if layer_type not in plugin_cfg:
raise KeyError(f'Unrecognized plugin type {layer_type}')
else:
abbr, plugin_layer = plugin_cfg[layer_type]
assert isinstance(postfix, (int, str))
name = abbr + str(postfix)
layer = plugin_layer(**kwargs, **cfg_)
return name, layer
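# A minimal illustrative usage of build_plugin_layer (editor's sketch; the
# ContextBlock arguments below are assumptions for illustration only):
# >>> cfg = dict(type='ContextBlock', in_channels=16, ratio=1. / 4)
# >>> name, layer = build_plugin_layer(cfg, postfix=1)
# >>> assert name == 'context_block1'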
# Modified from https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend # noqa
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules.utils import _pair
def normalize(grid):
"""Normalize input grid from [-1, 1] to [0, 1]
Args:
grid (Tensor): The grid to be normalize, range [-1, 1].
Returns:
Tensor: Normalized grid, range [0, 1].
"""
return (grid + 1.0) / 2.0
def denormalize(grid):
"""Denormalize input grid from range [0, 1] to [-1, 1]
Args:
grid (Tensor): The grid to be denormalize, range [0, 1].
Returns:
Tensor: Denormalized grid, range [-1, 1].
"""
return grid * 2.0 - 1.0
def generate_grid(num_grid, size, device):
"""Generate regular square grid of points in [0, 1] x [0, 1] coordinate
space.
Args:
num_grid (int): The number of grids to sample, one for each region.
size (tuple(int, int)): The side size of the regular grid.
device (torch.device): Desired device of returned tensor.
Returns:
(torch.Tensor): A tensor of shape (num_grid, size[0]*size[1], 2) that
contains coordinates for the regular grids.
"""
affine_trans = torch.tensor([[[1., 0., 0.], [0., 1., 0.]]], device=device)
grid = F.affine_grid(
affine_trans, torch.Size((1, 1, *size)), align_corners=False)
grid = normalize(grid)
return grid.view(1, -1, 2).expand(num_grid, -1, -1)
def rel_roi_point_to_abs_img_point(rois, rel_roi_points):
"""Convert roi based relative point coordinates to image based absolute
point coordinates.
Args:
rois (Tensor): RoIs or BBoxes, shape (N, 4) or (N, 5)
        rel_roi_points (Tensor): Point coordinates inside RoI, relative to
            the RoI location, range (0, 1), shape (N, P, 2)
Returns:
Tensor: Image based absolute point coordinates, shape (N, P, 2)
"""
with torch.no_grad():
assert rel_roi_points.size(0) == rois.size(0)
assert rois.dim() == 2
assert rel_roi_points.dim() == 3
assert rel_roi_points.size(2) == 2
# remove batch idx
if rois.size(1) == 5:
rois = rois[:, 1:]
abs_img_points = rel_roi_points.clone()
abs_img_points[:, :, 0] = abs_img_points[:, :, 0] * (
rois[:, None, 2] - rois[:, None, 0])
abs_img_points[:, :, 1] = abs_img_points[:, :, 1] * (
rois[:, None, 3] - rois[:, None, 1])
abs_img_points[:, :, 0] += rois[:, None, 0]
abs_img_points[:, :, 1] += rois[:, None, 1]
return abs_img_points
def abs_img_point_to_rel_img_point(abs_img_points,
img_shape,
spatial_scale=1.):
"""Convert image based absolute point coordinates to image based relative
coordinates for sampling.
Args:
abs_img_points (Tensor): Image based absolute point coordinates,
shape (N, P, 2)
img_shape (tuple): (height, width) of image or feature map.
spatial_scale (float): Scale points by this factor. Default: 1.
Returns:
Tensor: Image based relative point coordinates for sampling,
shape (N, P, 2)
"""
assert isinstance(img_shape, tuple) and len(img_shape) == 2
h, w = img_shape
scale = torch.tensor([w, h],
dtype=torch.float,
device=abs_img_points.device)
scale = scale.view(1, 1, 2)
rel_img_points = abs_img_points / scale * spatial_scale
return rel_img_points
def rel_roi_point_to_rel_img_point(rois,
rel_roi_points,
img_shape,
spatial_scale=1.):
"""Convert roi based relative point coordinates to image based absolute
point coordinates.
Args:
rois (Tensor): RoIs or BBoxes, shape (N, 4) or (N, 5)
rel_roi_points (Tensor): Point coordinates inside RoI, relative to
RoI, location, range (0, 1), shape (N, P, 2)
img_shape (tuple): (height, width) of image or feature map.
spatial_scale (float): Scale points by this factor. Default: 1.
Returns:
Tensor: Image based relative point coordinates for sampling,
shape (N, P, 2)
"""
abs_img_point = rel_roi_point_to_abs_img_point(rois, rel_roi_points)
rel_img_point = abs_img_point_to_rel_img_point(abs_img_point, img_shape,
spatial_scale)
return rel_img_point
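# A worked example of the coordinate helpers above (editor's sketch; numbers
# are for illustration only). A point at relative (0.5, 0.5) inside an RoI
# with (x1, y1, x2, y2) = (10, 10, 30, 50) maps to the absolute image point
# (20, 30); dividing x by the image width (200) and y by the height (100)
# then gives the sampling coordinate (0.1, 0.3).
# >>> rois = torch.tensor([[10., 10., 30., 50.]])
# >>> rel_roi_points = torch.tensor([[[0.5, 0.5]]])
# >>> pts = rel_roi_point_to_rel_img_point(rois, rel_roi_points, (100, 200))
# >>> assert torch.allclose(pts, torch.tensor([[[0.1, 0.3]]]))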
def point_sample(input, points, align_corners=False, **kwargs):
"""A wrapper around :function:`grid_sample` to support 3D point_coords
tensors Unlike :function:`torch.nn.functional.grid_sample` it assumes
point_coords to lie inside [0, 1] x [0, 1] square.
Args:
input (Tensor): Feature map, shape (N, C, H, W).
points (Tensor): Image based absolute point coordinates (normalized),
range [0, 1] x [0, 1], shape (N, P, 2) or (N, Hgrid, Wgrid, 2).
align_corners (bool): Whether align_corners. Default: False
Returns:
Tensor: Features of `point` on `input`, shape (N, C, P) or
(N, C, Hgrid, Wgrid).
"""
add_dim = False
if points.dim() == 3:
add_dim = True
points = points.unsqueeze(2)
output = F.grid_sample(
input, denormalize(points), align_corners=align_corners, **kwargs)
if add_dim:
output = output.squeeze(3)
return output
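# A minimal illustrative usage of point_sample (editor's sketch; shapes are
# assumptions for illustration only): sample C-dimensional features at P
# normalized points per feature map.
# >>> feats = torch.rand(2, 8, 16, 16)   # (N, C, H, W)
# >>> points = torch.rand(2, 32, 2)      # (N, P, 2), coordinates in [0, 1]
# >>> sampled = point_sample(feats, points)
# >>> assert sampled.shape == (2, 8, 32)  # (N, C, P)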
class SimpleRoIAlign(nn.Module):
def __init__(self, out_size, spatial_scale, aligned=True):
"""Simple RoI align in PointRend, faster than standard RoIAlign.
Args:
out_size (tuple[int]): h, w
spatial_scale (float): scale the input boxes by this number
            aligned (bool): if False, use the legacy implementation in
                MMDetection; align_corners=True will be used in
                F.grid_sample. If True, align the results more perfectly.
"""
super(SimpleRoIAlign, self).__init__()
self.out_size = _pair(out_size)
self.spatial_scale = float(spatial_scale)
# to be consistent with other RoI ops
self.use_torchvision = False
self.aligned = aligned
def forward(self, features, rois):
num_imgs = features.size(0)
num_rois = rois.size(0)
rel_roi_points = generate_grid(
num_rois, self.out_size, device=rois.device)
point_feats = []
for batch_ind in range(num_imgs):
# unravel batch dim
feat = features[batch_ind].unsqueeze(0)
inds = (rois[:, 0].long() == batch_ind)
if inds.any():
rel_img_points = rel_roi_point_to_rel_img_point(
rois[inds], rel_roi_points[inds], feat.shape[2:],
self.spatial_scale).unsqueeze(0)
point_feat = point_sample(
feat, rel_img_points, align_corners=not self.aligned)
point_feat = point_feat.squeeze(0).transpose(0, 1)
point_feats.append(point_feat)
channels = features.size(1)
roi_feats = torch.cat(point_feats, dim=0)
roi_feats = roi_feats.reshape(num_rois, channels, *self.out_size)
return roi_feats
def __repr__(self):
format_str = self.__class__.__name__
        format_str += '(out_size={}, spatial_scale={})'.format(
            self.out_size, self.spatial_scale)
return format_str
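# A minimal illustrative usage of SimpleRoIAlign (editor's sketch; tensor
# values are assumptions for illustration only). RoIs follow the
# (batch_idx, x1, y1, x2, y2) convention used by the other RoI ops here.
# >>> feats = torch.rand(1, 4, 32, 32)
# >>> rois = torch.tensor([[0., 2., 2., 20., 20.]])
# >>> simple_roi_align = SimpleRoIAlign(out_size=7, spatial_scale=1.0)
# >>> out = simple_roi_align(feats, rois)
# >>> assert out.shape == (1, 4, 7, 7)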
# Modified from https://github.com/hszhao/semseg/blob/master/lib/psa
from torch import nn
from torch.autograd import Function
from torch.nn.modules.utils import _pair
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext',
['psamask_forward', 'psamask_backward'])
class PSAMaskFunction(Function):
@staticmethod
def symbolic(g, input, psa_type, mask_size):
return g.op(
'MMCVPSAMask', input, psa_type=psa_type, mask_size=mask_size)
@staticmethod
def forward(ctx, input, psa_type, mask_size):
ctx.psa_type = psa_type
ctx.mask_size = _pair(mask_size)
ctx.save_for_backward(input)
h_mask, w_mask = ctx.mask_size
batch_size, channels, h_feature, w_feature = input.size()
assert channels == h_mask * w_mask
output = input.new_zeros(
(batch_size, h_feature * w_feature, h_feature, w_feature))
ext_module.psamask_forward(
input,
output,
psa_type=psa_type,
num_=batch_size,
h_feature=h_feature,
w_feature=w_feature,
h_mask=h_mask,
w_mask=w_mask,
half_h_mask=(h_mask - 1) // 2,
half_w_mask=(w_mask - 1) // 2)
return output
@staticmethod
def backward(ctx, grad_output):
input = ctx.saved_tensors[0]
psa_type = ctx.psa_type
h_mask, w_mask = ctx.mask_size
batch_size, channels, h_feature, w_feature = input.size()
grad_input = grad_output.new_zeros(
(batch_size, channels, h_feature, w_feature))
ext_module.psamask_backward(
grad_output,
grad_input,
psa_type=psa_type,
num_=batch_size,
h_feature=h_feature,
w_feature=w_feature,
h_mask=h_mask,
w_mask=w_mask,
half_h_mask=(h_mask - 1) // 2,
half_w_mask=(w_mask - 1) // 2)
return grad_input, None, None, None
psa_mask = PSAMaskFunction.apply
class PSAMask(nn.Module):
def __init__(self, psa_type, mask_size=None):
super(PSAMask, self).__init__()
assert psa_type in ['collect', 'distribute']
if psa_type == 'collect':
psa_type_enum = 0
else:
psa_type_enum = 1
self.psa_type_enum = psa_type_enum
self.mask_size = mask_size
self.psa_type = psa_type
def forward(self, input):
return psa_mask(input, self.psa_type_enum, self.mask_size)
def __repr__(self):
s = self.__class__.__name__
s += f'(psa_type={self.psa_type}, '
s += f'mask_size={self.mask_size})'
return s
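# A minimal illustrative usage of PSAMask (editor's sketch; assumes the
# compiled `_ext` module and a CUDA device; shapes are for illustration
# only). The input channel count must equal mask_h * mask_w.
# >>> psa = PSAMask('collect', mask_size=(9, 9))
# >>> x = torch.rand(2, 81, 16, 16).cuda()
# >>> out = psa(x)
# >>> assert out.shape == (2, 16 * 16, 16, 16)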
import torch.nn as nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext',
['roi_align_forward', 'roi_align_backward'])
class RoIAlignFunction(Function):
@staticmethod
def symbolic(g, input, rois, output_size, spatial_scale, sampling_ratio,
pool_mode, aligned):
return g.op(
'MMCVRoIAlign',
input,
rois,
aligned_height=output_size[0],
aligned_weight=output_size[1],
spatial_scale=spatial_scale,
sampling_ratio=sampling_ratio,
pool_mode=pool_mode,
aligned=aligned)
@staticmethod
def forward(ctx,
input,
rois,
output_size,
spatial_scale=1.0,
sampling_ratio=0,
pool_mode='avg',
aligned=True):
ctx.output_size = _pair(output_size)
ctx.spatial_scale = spatial_scale
ctx.sampling_ratio = sampling_ratio
assert pool_mode in ('max', 'avg')
ctx.pool_mode = 0 if pool_mode == 'max' else 1
ctx.aligned = aligned
ctx.input_shape = input.size()
assert rois.size(1) == 5, 'RoI must be (idx, x1, y1, x2, y2)!'
output_shape = (rois.size(0), input.size(1), ctx.output_size[0],
ctx.output_size[1])
output = input.new_zeros(output_shape)
if ctx.pool_mode == 0:
argmax_y = input.new_zeros(output_shape)
argmax_x = input.new_zeros(output_shape)
else:
argmax_y = input.new_zeros(0)
argmax_x = input.new_zeros(0)
ext_module.roi_align_forward(
input,
rois,
output,
argmax_y,
argmax_x,
aligned_height=ctx.output_size[0],
aligned_width=ctx.output_size[1],
spatial_scale=ctx.spatial_scale,
sampling_ratio=ctx.sampling_ratio,
pool_mode=ctx.pool_mode,
aligned=ctx.aligned)
ctx.save_for_backward(rois, argmax_y, argmax_x)
return output
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
rois, argmax_y, argmax_x = ctx.saved_tensors
grad_input = grad_output.new_zeros(ctx.input_shape)
ext_module.roi_align_backward(
grad_output,
rois,
argmax_y,
argmax_x,
grad_input,
aligned_height=ctx.output_size[0],
aligned_width=ctx.output_size[1],
spatial_scale=ctx.spatial_scale,
sampling_ratio=ctx.sampling_ratio,
pool_mode=ctx.pool_mode,
aligned=ctx.aligned)
return grad_input, None, None, None, None, None, None
roi_align = RoIAlignFunction.apply
class RoIAlign(nn.Module):
"""RoI align pooling layer.
Args:
output_size (tuple): h, w
spatial_scale (float): scale the input boxes by this number
        sampling_ratio (int): number of input samples to take for each
            output sample. 0 to take samples densely.
pool_mode (str, 'avg' or 'max'): pooling mode in each bin.
aligned (bool): if False, use the legacy implementation in
MMDetection. If True, align the results more perfectly.
Note:
The implementation of RoIAlign when aligned=True is modified from
https://github.com/facebookresearch/detectron2/
The meaning of aligned=True:
Given a continuous coordinate c, its two neighboring pixel
indices (in our pixel model) are computed by floor(c - 0.5) and
ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete
indices [0] and [1] (which are sampled from the underlying signal
at continuous coordinates 0.5 and 1.5). But the original roi_align
(aligned=False) does not subtract the 0.5 when computing
neighboring pixel indices and therefore it uses pixels with a
slightly incorrect alignment (relative to our pixel model) when
performing bilinear interpolation.
With `aligned=True`,
we first appropriately scale the ROI and then shift it by -0.5
        prior to calling roi_align. This produces the correct neighbors;
        the difference does not affect the model's performance when
        RoIAlign is used together with conv layers.
"""
def __init__(self,
output_size,
spatial_scale=1.0,
sampling_ratio=0,
pool_mode='avg',
aligned=True):
super(RoIAlign, self).__init__()
self.output_size = _pair(output_size)
self.spatial_scale = float(spatial_scale)
self.sampling_ratio = int(sampling_ratio)
self.pool_mode = pool_mode
self.aligned = aligned
def forward(self, input, rois):
"""
Args:
input: NCHW images
rois: Bx5 boxes. First column is the index into N.\
The other 4 columns are xyxy.
"""
return roi_align(input, rois, self.output_size, self.spatial_scale,
self.sampling_ratio, self.pool_mode, self.aligned)
def __repr__(self):
s = self.__class__.__name__
s += f'(output_size={self.output_size}, '
s += f'spatial_scale={self.spatial_scale}, '
s += f'sampling_ratio={self.sampling_ratio}, '
s += f'pool_mode={self.pool_mode}, '
s += f'aligned={self.aligned})'
return s
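# A minimal illustrative usage of RoIAlign (editor's sketch; assumes the
# compiled `_ext` module and a CUDA device; tensor values are for
# illustration only).
# >>> feats = torch.rand(1, 256, 32, 32).cuda()
# >>> rois = torch.tensor([[0., 4., 4., 20., 20.]]).cuda()
# >>> layer = RoIAlign(output_size=7, spatial_scale=1. / 4, sampling_ratio=2)
# >>> out = layer(feats, rois)
# >>> assert out.shape == (1, 256, 7, 7)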
import torch
import torch.nn as nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext',
['roi_pool_forward', 'roi_pool_backward'])
class RoIPoolFunction(Function):
@staticmethod
def symbolic(g, input, rois, output_size, spatial_scale):
return g.op(
'MMCVRoIPool',
input,
rois,
pooled_height=output_size[0],
pooled_width=output_size[1],
spatial_scale=spatial_scale)
@staticmethod
def forward(ctx, input, rois, output_size, spatial_scale=1.0):
ctx.output_size = _pair(output_size)
ctx.spatial_scale = spatial_scale
ctx.input_shape = input.size()
assert rois.size(1) == 5, 'RoI must be (idx, x1, y1, x2, y2)!'
output_shape = (rois.size(0), input.size(1), ctx.output_size[0],
ctx.output_size[1])
output = input.new_zeros(output_shape)
argmax = input.new_zeros(output_shape, dtype=torch.int)
ext_module.roi_pool_forward(
input,
rois,
output,
argmax,
pooled_height=ctx.output_size[0],
pooled_width=ctx.output_size[1],
spatial_scale=ctx.spatial_scale)
ctx.save_for_backward(rois, argmax)
return output
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
rois, argmax = ctx.saved_tensors
grad_input = grad_output.new_zeros(ctx.input_shape)
ext_module.roi_pool_backward(
grad_output,
rois,
argmax,
grad_input,
pooled_height=ctx.output_size[0],
pooled_width=ctx.output_size[1],
spatial_scale=ctx.spatial_scale)
return grad_input, None, None, None
roi_pool = RoIPoolFunction.apply
class RoIPool(nn.Module):
def __init__(self, output_size, spatial_scale=1.0):
super(RoIPool, self).__init__()
self.output_size = _pair(output_size)
self.spatial_scale = float(spatial_scale)
def forward(self, input, rois):
return roi_pool(input, rois, self.output_size, self.spatial_scale)
def __repr__(self):
s = self.__class__.__name__
s += f'(output_size={self.output_size}, '
s += f'spatial_scale={self.spatial_scale})'
return s
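# A minimal illustrative usage of RoIPool (editor's sketch; assumes the
# compiled `_ext` module and a CUDA device; tensor values are for
# illustration only).
# >>> feats = torch.rand(1, 64, 32, 32).cuda()
# >>> rois = torch.tensor([[0., 0., 0., 16., 16.]]).cuda()
# >>> pool = RoIPool(output_size=(7, 7), spatial_scale=1.0)
# >>> out = pool(feats, rois)
# >>> assert out.shape == (1, 64, 7, 7)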
import torch
import torch.distributed as dist
import torch.nn.functional as F
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.module import Module
from torch.nn.parameter import Parameter
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', [
'sync_bn_forward_mean', 'sync_bn_forward_var', 'sync_bn_forward_output',
'sync_bn_backward_param', 'sync_bn_backward_data'
])
class SyncBatchNormFunction(Function):
@staticmethod
def symbolic(g, input, running_mean, running_var, weight, bias, momentum,
eps, group, group_size):
return g.op(
'MMCVSyncBatchNorm',
input,
running_mean,
running_var,
weight,
bias,
momentum=momentum,
eps=eps,
group=group,
group_size=group_size)
@staticmethod
def forward(self, input, running_mean, running_var, weight, bias, momentum,
eps, group, group_size):
self.momentum = momentum
self.eps = eps
self.group = group
self.group_size = group_size
assert isinstance(
input, (torch.HalfTensor, torch.FloatTensor,
torch.cuda.HalfTensor, torch.cuda.FloatTensor)), \
            f'only support Half or Float Tensor, but got {input.type()}'
output = torch.empty_like(input)
input3d = input.view(input.size(0), input.size(1), -1)
output3d = output.view_as(input3d)
mean = torch.empty(
input3d.size(1), dtype=torch.float, device=input3d.device)
var = torch.empty(
input3d.size(1), dtype=torch.float, device=input3d.device)
if input3d.requires_grad or weight.requires_grad or bias.requires_grad:
norm = torch.empty_like(
input3d, dtype=torch.float, device=input3d.device)
std = torch.empty(
input3d.size(1), dtype=torch.float, device=input3d.device)
else:
norm = torch.empty(0, dtype=torch.float, device=input3d.device)
std = torch.empty(0, dtype=torch.float, device=input3d.device)
ext_module.sync_bn_forward_mean(input3d, mean)
if self.group_size > 1:
dist.all_reduce(mean, group=self.group)
mean /= self.group_size
ext_module.sync_bn_forward_var(input3d, mean, var)
if self.group_size > 1:
dist.all_reduce(var, group=self.group)
var /= self.group_size
ext_module.sync_bn_forward_output(
input3d,
mean,
var,
running_mean,
running_var,
weight,
bias,
norm,
std,
output3d,
eps=self.eps,
momentum=self.momentum,
group_size=self.group_size)
self.save_for_backward(norm, std, weight)
return output
@staticmethod
@once_differentiable
def backward(self, grad_output):
norm, std, weight = self.saved_tensors
grad_weight = torch.empty_like(weight)
grad_bias = torch.empty_like(weight)
grad_input = torch.empty_like(grad_output)
grad_output3d = grad_output.view(
grad_output.size(0), grad_output.size(1), -1)
grad_input3d = grad_input.view_as(grad_output3d)
ext_module.sync_bn_backward_param(grad_output3d, norm, grad_weight,
grad_bias)
# all reduce
if self.group_size > 1:
dist.all_reduce(grad_weight, group=self.group)
dist.all_reduce(grad_bias, group=self.group)
grad_weight /= self.group_size
grad_bias /= self.group_size
ext_module.sync_bn_backward_data(grad_output3d, weight, grad_weight,
grad_bias, norm, std, grad_input3d)
return grad_input, None, None, grad_weight, grad_bias, \
None, None, None, None
class SyncBatchNorm(Module):
def __init__(self,
num_features,
eps=1e-5,
momentum=0.1,
affine=True,
track_running_stats=True,
group=dist.group.WORLD):
super(SyncBatchNorm, self).__init__()
self.num_features = num_features
self.eps = eps
self.momentum = momentum
self.affine = affine
self.track_running_stats = track_running_stats
self.group = group
self.group_size = dist.get_world_size(group)
if self.affine:
self.weight = Parameter(torch.Tensor(num_features))
self.bias = Parameter(torch.Tensor(num_features))
else:
self.register_parameter('weight', None)
self.register_parameter('bias', None)
if self.track_running_stats:
self.register_buffer('running_mean', torch.zeros(num_features))
self.register_buffer('running_var', torch.ones(num_features))
self.register_buffer('num_batches_tracked',
torch.tensor(0, dtype=torch.long))
else:
self.register_buffer('running_mean', None)
self.register_buffer('running_var', None)
self.register_buffer('num_batches_tracked', None)
self.reset_parameters()
def reset_running_stats(self):
if self.track_running_stats:
self.running_mean.zero_()
self.running_var.fill_(1)
self.num_batches_tracked.zero_()
def reset_parameters(self):
self.reset_running_stats()
if self.affine:
            self.weight.data.uniform_()  # PyTorch uses ones_() by default
self.bias.data.zero_()
def forward(self, input):
if input.dim() < 2:
raise ValueError(
f'expected at least 2D input, got {input.dim()}D input')
if self.momentum is None:
exponential_average_factor = 0.0
else:
exponential_average_factor = self.momentum
if self.training and self.track_running_stats:
if self.num_batches_tracked is not None:
self.num_batches_tracked += 1
if self.momentum is None: # use cumulative moving average
exponential_average_factor = 1.0 / float(
self.num_batches_tracked)
else: # use exponential moving average
exponential_average_factor = self.momentum
if self.training or not self.track_running_stats:
return SyncBatchNormFunction.apply(input, self.running_mean,
self.running_var, self.weight,
self.bias,
exponential_average_factor,
self.eps, self.group,
self.group_size)
else:
return F.batch_norm(input, self.running_mean, self.running_var,
self.weight, self.bias, False,
exponential_average_factor, self.eps)
def __repr__(self):
s = self.__class__.__name__
s += f'({self.num_features}, '
s += f'eps={self.eps}, '
s += f'momentum={self.momentum}, '
s += f'affine={self.affine}, '
s += f'track_running_stats={self.track_running_stats}, '
s += f'group_size={self.group_size})'
return s
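# A minimal illustrative usage of SyncBatchNorm (editor's sketch; assumes the
# compiled `_ext` module, a CUDA device and an already initialized default
# process group, e.g. via torch.distributed.init_process_group; values are
# for illustration only).
# >>> sync_bn = SyncBatchNorm(num_features=64).cuda()
# >>> x = torch.rand(8, 64, 32, 32).cuda()
# >>> y = sync_bn(x)
# >>> assert y.shape == x.shape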
"""Modified from https://github.com/facebookresearch/detectron2/blob/master.
/detectron2/layers/wrappers.py Wrap some nn modules to support empty
tensor input. Currently, these wrappers are mainly used in mask heads
like fcn_mask_head and maskiou_heads since mask heads are trained on
only positive RoIs.
"""
import math
import torch
import torch.nn as nn
from torch.nn.modules.utils import _pair
from ..cnn import CONV_LAYERS
class NewEmptyTensorOp(torch.autograd.Function):
@staticmethod
def forward(ctx, x, new_shape):
ctx.shape = x.shape
return x.new_empty(new_shape)
@staticmethod
def backward(ctx, grad):
shape = ctx.shape
return NewEmptyTensorOp.apply(grad, shape), None
@CONV_LAYERS.register_module('Conv', force=True)
class Conv2d(nn.Conv2d):
def forward(self, x):
if x.numel() == 0 and torch.__version__ <= '1.4':
out_shape = [x.shape[0], self.out_channels]
for i, k, p, s, d in zip(x.shape[-2:], self.kernel_size,
self.padding, self.stride, self.dilation):
o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1
out_shape.append(o)
empty = NewEmptyTensorOp.apply(x, out_shape)
if self.training:
# produce dummy gradient to avoid DDP warning.
dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
return empty + dummy
else:
return empty
return super().forward(x)
class ConvTranspose2d(nn.ConvTranspose2d):
def forward(self, x):
if x.numel() == 0 and torch.__version__ <= '1.4.0':
out_shape = [x.shape[0], self.out_channels]
for i, k, p, s, d, op in zip(x.shape[-2:], self.kernel_size,
self.padding, self.stride,
self.dilation, self.output_padding):
out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op)
empty = NewEmptyTensorOp.apply(x, out_shape)
if self.training:
# produce dummy gradient to avoid DDP warning.
dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
return empty + dummy
else:
return empty
return super(ConvTranspose2d, self).forward(x)
class MaxPool2d(nn.MaxPool2d):
def forward(self, x):
if x.numel() == 0 and torch.__version__ <= '1.4':
out_shape = list(x.shape[:2])
for i, k, p, s, d in zip(x.shape[-2:], _pair(self.kernel_size),
_pair(self.padding), _pair(self.stride),
_pair(self.dilation)):
o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1
o = math.ceil(o) if self.ceil_mode else math.floor(o)
out_shape.append(o)
empty = NewEmptyTensorOp.apply(x, out_shape)
return empty
return super().forward(x)
class Linear(torch.nn.Linear):
def forward(self, x):
if x.numel() == 0:
out_shape = [x.shape[0], self.out_features]
empty = NewEmptyTensorOp.apply(x, out_shape)
if self.training:
# produce dummy gradient to avoid DDP warning.
dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
return empty + dummy
else:
return empty
return super().forward(x)
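# A minimal illustration of the empty-tensor wrappers above (editor's sketch;
# shapes are assumptions for illustration only): when a mask head receives no
# positive RoIs, the wrapped layers still return correctly shaped empty
# outputs instead of raising an error.
# >>> fc = Linear(256, 4)
# >>> assert fc(torch.empty(0, 256)).shape == (0, 4)
# >>> conv = Conv2d(3, 8, kernel_size=3, padding=1)
# >>> assert conv(torch.empty(0, 3, 32, 32)).shape == (0, 8, 32, 32)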
import importlib
import os
from collections import namedtuple
import torch
if torch.__version__ != 'parrots':
def load_ext(name, funcs):
ext = importlib.import_module('mmcv.' + name)
for fun in funcs:
            assert hasattr(ext, fun), f'{fun} is missing in module {name}'
return ext
else:
from parrots import extension
def load_ext(name, funcs):
ExtModule = namedtuple('ExtModule', funcs)
ext_list = []
lib_root = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
for fun in funcs:
if fun in ['nms', 'softnms']:
ext_list.append(extension.load(fun, name, lib_dir=lib_root).op)
else:
ext_list.append(
extension.load(fun, name, lib_dir=lib_root).op_)
return ExtModule(*ext_list)
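# A minimal illustrative usage of load_ext (editor's sketch; assumes the
# corresponding extension has been compiled). Both the PyTorch and the
# parrots branches return an object whose attributes are the requested
# extension functions, so callers such as the ops above stay backend
# agnostic.
# >>> ext = load_ext('_ext', ['nms', 'softnms'])
# >>> assert all(hasattr(ext, f) for f in ['nms', 'softnms'])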
# Copyright (c) Open-MMLab. All rights reserved.
__version__ = '1.0rc0'
# Copyright (c) Open-MMLab. All rights reserved.
import numpy as np
from mmcv._flow_warp_ext import flow_warp_c
from mmcv.arraymisc import dequantize, quantize
from mmcv.image import imread, imwrite
from mmcv.utils import is_str
......