Unverified commit c0f5492e authored by zhuyuanhao, committed by GitHub

add ext ops, support parrots (#310)



* add ext ops, support parrots

* fix lint

* fix lint

* update op from mmdetection

* support non-pytorch env

* fix import bug

* test not import mmcv.op

* rename mmcv.op to mmcv.ops

* fix compile warning

* 1. fix syncbn warning in pytorch 1.5
2. support only cpu compile
3. add point_sample from mmdet

* fix text bug

* update docstrings

* fix line endings

* minor updates

* remove non_local from ops

* bug fix for nonlocal2d

* rename ops_ext to _ext and _ext to _flow_warp_ext

* update the doc

* try clang-format github action

* fix github action

* add ops to api.rst

* fix cpp format

* fix clang format issues

* remove .clang-format
Co-authored-by: Kai Chen <chenkaidev@gmail.com>
parent a7bf7701
...@@ -7,46 +7,51 @@ on: [push, pull_request]
jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: [3.6, 3.7, 3.8]
    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v1
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install linting dependencies
      run: |
        python -m pip install --upgrade pip
        pip install flake8 isort yapf
    - name: Lint with flake8
      run: flake8 --max-complexity 20 .
    - name: Lint with isort
      run: isort -rc --check-only --diff mmcv/ tests/ examples/
    - name: Format python codes with yapf
      run: yapf -r -d mmcv/ tests/ examples/
    - name: Format c/cuda codes with clang-format
      uses: DoozyX/clang-format-lint-action@v0.6
      with:
        source: mmcv/ops/csrc
        extensions: h,c,cpp,hpp,cu,cuh
        style: google
    - name: Build and install
      run: rm -rf .eggs && pip install -e .
    - name: Install system dependencies
      run: sudo apt-get update && sudo apt-get install -y ffmpeg libturbojpeg
    - name: Install unittest dependencies
      run: |
        pip install pytest coverage lmdb PyTurboJPEG
        pip install torch==1.4.0+cpu torchvision==0.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
    - name: Run unittests and generate coverage report
      run: |
        coverage run --branch --source=mmcv -m pytest tests/
        coverage xml
        coverage report -m
    - name: Upload coverage to Codecov
      uses: codecov/codecov-action@master
      with:
        file: ./coverage.xml
        flags: unittests
        env_vars: OS,PYTHON
        name: codecov-umbrella
        fail_ci_if_error: false
...@@ -28,3 +28,11 @@ repos:
        args: ["--remove"]
      - id: mixed-line-ending
        args: ["--fix=lf"]
  - repo: local
    hooks:
      - id: clang-format
        name: clang-format
        description: Format files with ClangFormat
        entry: clang-format -i
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$
...@@ -24,12 +24,12 @@ and `MMAction <https://github.com/open-mmlab/mmaction>`_.
It provides the following functionalities.

- Universal IO APIs
- Image/Video processing
- Image and annotation visualization
- Useful utilities (progress bar, timer, ...)
- PyTorch runner with hooking mechanism
- Various CNN architectures
- High-quality implementation of common CUDA ops

See the `documentation <http://mmcv.readthedocs.io/en/latest>`_ for more features and usage.
......
...@@ -41,3 +41,8 @@ runner
------
.. automodule:: mmcv.runner
    :members:

ops
------
.. automodule:: mmcv.ops
    :members:
\ No newline at end of file
...@@ -90,3 +90,44 @@ conv1 = nn.Conv2d(3, 3, 1)
normal_init(conv1, std=0.01, bias=0)
xavier_init(conv1, distribution='uniform')
```
### Model Zoo
Besides the torchvision pre-trained models, we also provide pre-trained models for the following CNNs:
- VGG Caffe
- ResNet Caffe
- ResNeXt
- ResNet with Group Normalization
- ResNet with Group Normalization and Weight Standardization
- HRNetV2
- Res2Net
- RegNet
#### Model URLs in JSON
The model zoo links in MMCV are managed by JSON files.
Each json file consists of key-value pairs, where the key is a model name and the value is its url or path.
An example json file looks like this:
```json
{
"model_a": "https://example.com/models/model_a_9e5bac.pth",
"model_b": "pretrain/model_b_ab3ef2c.pth"
}
```
The default links of the pre-trained models hosted on the Open-MMLab AWS can be found [here](../mmcv/model_zoo/open_mmlab.json).
You may override the default links by putting `open-mmlab.json` under `MMCV_HOME`. If `MMCV_HOME` is not set in the environment, `~/.cache/mmcv` will be used by default. You may `export MMCV_HOME=/your/path` to use your own path.
The external json files will be merged into the default one. If the same key is present in both the external and the default json, the external one will be used.
#### Load Checkpoint
The following types are supported for the `filename` argument of `mmcv.load_checkpoint()`; a minimal usage sketch follows the list.
- filepath: The filepath of the checkpoint.
- `http://xxx` and `https://xxx`: The link to download the checkpoint. The `SHA256` postfix should be contained in the filename.
- `torchvision://xxx`: The model links in `torchvision.models`. Please refer to [torchvision](https://pytorch.org/docs/stable/torchvision/models.html) for details.
- `open-mmlab://xxx`: The model links or filepaths provided in the default and additional json files.
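A minimal sketch of loading weights through this mechanism. The helper is imported from `mmcv.runner` here, and the `torchvision://resnet50` entry and network access are assumptions of this example, not part of the change above:
```python
import torchvision
from mmcv.runner import load_checkpoint

# Build an arbitrary model and fill it with pre-trained weights.
model = torchvision.models.resnet50()

# Any of the supported schemes can be passed as `filename`:
# a local path, an http(s) URL, `torchvision://...` or `open-mmlab://...`.
load_checkpoint(model, 'torchvision://resnet50', map_location='cpu')
```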
...@@ -48,7 +48,10 @@ extensions = [
    'recommonmark',
]

autodoc_mock_imports = [
    'cv2', 'mmcv._ext', 'mmcv._flow_warp_ext', 'mmcv.utils.ext_loader',
    'torchvision'
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
......
...@@ -13,7 +13,7 @@ Contents
   utils.md
   runner.md
   cnn.md
   ops.md
   api.rst
......
## Model Zoo
Besides the torchvision pre-trained models, we also provide pre-trained models for the following CNNs:
* VGG Caffe
* ResNet Caffe
* ResNeXt
* ResNet with Group Normalization
* ResNet with Group Normalization and Weight Standardization
* HRNetV2
* Res2Net
* RegNet
### Model URLs in JSON
The model zoo links in MMCV are managed by JSON files.
Each json file consists of key-value pairs, where the key is a model name and the value is its url or path.
An example json file looks like this:
```json
{
"model_a": "https://example.com/models/model_a_9e5bac.pth",
"model_b": "pretrain/model_b_ab3ef2c.pth"
}
```
The default links of the pre-trained models hosted on the Open-MMLab AWS can be found [here](../mmcv/model_zoo/open_mmlab.json).
You may override the default links by putting `open-mmlab.json` under `MMCV_HOME`. If `MMCV_HOME` is not set in the environment, `~/.cache/mmcv` will be used by default. You may `export MMCV_HOME=/your/path` to use your own path.
The external json files will be merged into the default one. If the same key is present in both the external and the default json, the external one will be used.
### Load Checkpoint
The following types are supported for the `filename` argument of `mmcv.load_checkpoint()`.
* filepath: The filepath of the checkpoint.
* `http://xxx` and `https://xxx`: The link to download the checkpoint. The `SHA256` postfix should be contained in the filename.
* `torchvision://xxx`: The model links in `torchvision.models`. Please refer to [torchvision](https://pytorch.org/docs/stable/torchvision/models.html) for details.
* `open-mmlab://xxx`: The model links or filepaths provided in the default and additional json files.
## CUDA ops
We implement common CUDA ops used in detection, segmentation, and other tasks; a short usage sketch follows the list below.
- BBoxOverlaps
- CARAFE
- CrissCrossAttention
- ContextBlock
- CornerPool
- Deformable Convolution v1/v2
- Deformable RoIPool
- GeneralizedAttention
- MaskedConv
- NMS
- PSAMask
- RoIPool
- RoIAlign
- SimpleRoIAlign
- SigmoidFocalLoss
- SoftmaxFocalLoss
- SoftNMS
- Synchronized BatchNorm
- Weight standardization
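A minimal sketch of calling two of the ops listed above from Python. A GPU build of the compiled extensions is assumed, and the return values follow the implementations ported from mmdetection:
```python
import torch
from mmcv.ops import bbox_overlaps, nms

boxes = torch.tensor([[0., 0., 10., 10.],
                      [1., 1., 11., 11.],
                      [20., 20., 30., 30.]]).cuda()
scores = torch.tensor([0.9, 0.8, 0.7]).cuda()

# Pairwise IoU matrix of shape (3, 3).
ious = bbox_overlaps(boxes, boxes)

# Non-maximum suppression at IoU threshold 0.5: `dets` holds the kept boxes
# with their scores appended, `keep` the indices of the kept boxes.
dets, keep = nms(boxes, scores, 0.5)
```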
...@@ -12,3 +12,4 @@ from .visualization import *
# without PyTorch.
# - runner
# - parallel
# - op
from .bbox import bbox_overlaps
from .carafe import CARAFE, CARAFENaive, CARAFEPack, carafe, carafe_naive
from .cc_attention import CrissCrossAttention
from .context_block import ContextBlock
from .conv_ws import ConvWS2d, conv_ws_2d
from .corner_pool import CornerPool
from .deform_conv import DeformConv2d, DeformConv2dPack, deform_conv2d
from .deform_roi_pool import (DeformRoIPool, DeformRoIPoolPack,
ModulatedDeformRoIPoolPack, deform_roi_pool)
from .focal_loss import (SigmoidFocalLoss, SoftmaxFocalLoss,
sigmoid_focal_loss, softmax_focal_loss)
from .generalized_attention import GeneralizedAttention
from .info import get_compiler_version, get_compiling_cuda_version
from .masked_conv import MaskedConv2d, masked_conv2d
from .modulated_deform_conv import (ModulatedDeformConv2d,
ModulatedDeformConv2dPack,
modulated_deform_conv2d)
from .nms import batched_nms, nms, nms_match, soft_nms
from .plugin import build_plugin_layer
from .point_sample import (SimpleRoIAlign, point_sample,
rel_roi_point_to_rel_img_point)
from .psa_mask import PSAMask
from .roi_align import RoIAlign, roi_align
from .roi_pool import RoIPool, roi_pool
from .sync_bn import SyncBatchNorm
from .wrappers import Conv2d, ConvTranspose2d, Linear, MaxPool2d
__all__ = [
'bbox_overlaps', 'CARAFE', 'CARAFENaive', 'CARAFEPack', 'carafe',
'carafe_naive', 'ContextBlock', 'ConvWS2d', 'conv_ws_2d', 'CornerPool',
'DeformConv2d', 'DeformConv2dPack', 'deform_conv2d', 'DeformRoIPool',
'DeformRoIPoolPack', 'ModulatedDeformRoIPoolPack', 'deform_roi_pool',
'SigmoidFocalLoss', 'SoftmaxFocalLoss', 'sigmoid_focal_loss',
'softmax_focal_loss', 'GeneralizedAttention', 'get_compiler_version',
'get_compiling_cuda_version', 'MaskedConv2d', 'masked_conv2d',
'ModulatedDeformConv2d', 'ModulatedDeformConv2dPack',
'modulated_deform_conv2d', 'batched_nms', 'nms', 'soft_nms', 'nms_match',
'build_plugin_layer', 'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool',
'SyncBatchNorm', 'Conv2d', 'ConvTranspose2d', 'Linear', 'MaxPool2d',
'CrissCrossAttention', 'PSAMask', 'point_sample',
'rel_roi_point_to_rel_img_point', 'SimpleRoIAlign'
]
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['bbox_overlaps'])
def bbox_overlaps(bboxes1, bboxes2, mode='iou', aligned=False, offset=0):
"""Calculate overlap between two set of bboxes.
If ``aligned`` is ``False``, then calculate the ious between each bbox
of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
bboxes1 and bboxes2.
Args:
bboxes1 (Tensor): shape (m, 4) in <x1, y1, x2, y2> format or empty.
bboxes2 (Tensor): shape (n, 4) in <x1, y1, x2, y2> format or empty.
If aligned is ``True``, then m and n must be equal.
mode (str): "iou" (intersection over union) or iof (intersection over
foreground).
Returns:
        ious (Tensor): shape (m, n) if ``aligned`` is False else shape (m, 1)
Example:
>>> bboxes1 = torch.FloatTensor([
>>> [0, 0, 10, 10],
>>> [10, 10, 20, 20],
>>> [32, 32, 38, 42],
>>> ])
>>> bboxes2 = torch.FloatTensor([
>>> [0, 0, 10, 20],
>>> [0, 10, 10, 19],
>>> [10, 10, 20, 20],
>>> ])
>>> bbox_overlaps(bboxes1, bboxes2)
tensor([[0.5000, 0.0000, 0.0000],
[0.0000, 0.0000, 1.0000],
[0.0000, 0.0000, 0.0000]])
Example:
>>> empty = torch.FloatTensor([])
>>> nonempty = torch.FloatTensor([
>>> [0, 0, 10, 9],
>>> ])
>>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1)
>>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0)
>>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0)
"""
mode_dict = {'iou': 0, 'iof': 1}
assert mode in mode_dict.keys()
mode_flag = mode_dict[mode]
    # Either the boxes are empty or the length of the boxes' last dimension is 4
assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0)
assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0)
assert offset == 1 or offset == 0
rows = bboxes1.size(0)
cols = bboxes2.size(0)
if aligned:
assert rows == cols
if rows * cols == 0:
return bboxes1.new(rows, 1) if aligned else bboxes1.new(rows, cols)
if aligned:
ious = bboxes1.new_zeros(rows)
else:
ious = bboxes1.new_zeros((rows, cols))
ext_module.bbox_overlaps(
bboxes1, bboxes2, ious, mode=mode_flag, aligned=aligned, offset=offset)
return ious
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function
from torch.nn.modules.module import Module
from ..cnn import UPSAMPLE_LAYERS, normal_init, xavier_init
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', [
'carafe_naive_forward', 'carafe_naive_backward', 'carafe_forward',
'carafe_backward'
])
class CARAFENaiveFunction(Function):
@staticmethod
def symbolic(g, features, masks, kernel_size, group_size, scale_factor):
return g.op(
'MMCVCARAFENaive',
features,
masks,
kernel_size=kernel_size,
group_size=group_size,
scale_factor=scale_factor)
@staticmethod
def forward(ctx, features, masks, kernel_size, group_size, scale_factor):
assert scale_factor >= 1
assert masks.size(1) == kernel_size * kernel_size * group_size
assert masks.size(-1) == features.size(-1) * scale_factor
assert masks.size(-2) == features.size(-2) * scale_factor
assert features.size(1) % group_size == 0
assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1
ctx.kernel_size = kernel_size
ctx.group_size = group_size
ctx.scale_factor = scale_factor
ctx.feature_size = features.size()
ctx.mask_size = masks.size()
n, c, h, w = features.size()
output = features.new_zeros((n, c, h * scale_factor, w * scale_factor))
ext_module.carafe_naive_forward(
features,
masks,
output,
kernel_size=kernel_size,
group_size=group_size,
scale_factor=scale_factor)
if features.requires_grad or masks.requires_grad:
ctx.save_for_backward(features, masks)
return output
@staticmethod
def backward(ctx, grad_output):
assert grad_output.is_cuda
features, masks = ctx.saved_tensors
kernel_size = ctx.kernel_size
group_size = ctx.group_size
scale_factor = ctx.scale_factor
grad_input = torch.zeros_like(features)
grad_masks = torch.zeros_like(masks)
ext_module.carafe_naive_backward(
grad_output.contiguous(),
features,
masks,
grad_input,
grad_masks,
kernel_size=kernel_size,
group_size=group_size,
scale_factor=scale_factor)
return grad_input, grad_masks, None, None, None
carafe_naive = CARAFENaiveFunction.apply
class CARAFENaive(Module):
def __init__(self, kernel_size, group_size, scale_factor):
super(CARAFENaive, self).__init__()
assert isinstance(kernel_size, int) and isinstance(
group_size, int) and isinstance(scale_factor, int)
self.kernel_size = kernel_size
self.group_size = group_size
self.scale_factor = scale_factor
def forward(self, features, masks):
return carafe_naive(features, masks, self.kernel_size, self.group_size,
self.scale_factor)
class CARAFEFunction(Function):
@staticmethod
def symbolic(g, features, masks, kernel_size, group_size, scale_factor):
return g.op(
'MMCVCARAFE',
features,
masks,
kernel_size=kernel_size,
group_size=group_size,
scale_factor=scale_factor)
@staticmethod
def forward(ctx, features, masks, kernel_size, group_size, scale_factor):
assert scale_factor >= 1
assert masks.size(1) == kernel_size * kernel_size * group_size
assert masks.size(-1) == features.size(-1) * scale_factor
assert masks.size(-2) == features.size(-2) * scale_factor
assert features.size(1) % group_size == 0
assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1
ctx.kernel_size = kernel_size
ctx.group_size = group_size
ctx.scale_factor = scale_factor
ctx.feature_size = features.size()
ctx.mask_size = masks.size()
n, c, h, w = features.size()
output = features.new_zeros((n, c, h * scale_factor, w * scale_factor))
routput = features.new_zeros(output.size(), requires_grad=False)
rfeatures = features.new_zeros(features.size(), requires_grad=False)
rmasks = masks.new_zeros(masks.size(), requires_grad=False)
ext_module.carafe_forward(
features,
masks,
rfeatures,
routput,
rmasks,
output,
kernel_size=kernel_size,
group_size=group_size,
scale_factor=scale_factor)
if features.requires_grad or masks.requires_grad:
ctx.save_for_backward(features, masks, rfeatures)
return output
@staticmethod
def backward(ctx, grad_output):
assert grad_output.is_cuda
features, masks, rfeatures = ctx.saved_tensors
kernel_size = ctx.kernel_size
group_size = ctx.group_size
scale_factor = ctx.scale_factor
rgrad_output = torch.zeros_like(grad_output, requires_grad=False)
rgrad_input_hs = torch.zeros_like(grad_output, requires_grad=False)
rgrad_input = torch.zeros_like(features, requires_grad=False)
rgrad_masks = torch.zeros_like(masks, requires_grad=False)
grad_input = torch.zeros_like(features, requires_grad=False)
grad_masks = torch.zeros_like(masks, requires_grad=False)
ext_module.carafe_backward(
grad_output.contiguous(),
rfeatures,
masks,
rgrad_output,
rgrad_input_hs,
rgrad_input,
rgrad_masks,
grad_input,
grad_masks,
kernel_size=kernel_size,
group_size=group_size,
scale_factor=scale_factor)
return grad_input, grad_masks, None, None, None
carafe = CARAFEFunction.apply
class CARAFE(Module):
""" CARAFE: Content-Aware ReAssembly of FEatures
Please refer to https://arxiv.org/abs/1905.02188 for more details.
Args:
kernel_size (int): reassemble kernel size
group_size (int): reassemble group size
scale_factor (int): upsample ratio
Returns:
upsampled feature map
"""
def __init__(self, kernel_size, group_size, scale_factor):
super(CARAFE, self).__init__()
assert isinstance(kernel_size, int) and isinstance(
group_size, int) and isinstance(scale_factor, int)
self.kernel_size = kernel_size
self.group_size = group_size
self.scale_factor = scale_factor
def forward(self, features, masks):
return carafe(features, masks, self.kernel_size, self.group_size,
self.scale_factor)
@UPSAMPLE_LAYERS.register_module(name='carafe')
class CARAFEPack(nn.Module):
"""A unified package of CARAFE upsampler that contains: 1) channel
compressor 2) content encoder 3) CARAFE op.
Official implementation of ICCV 2019 paper
CARAFE: Content-Aware ReAssembly of FEatures
Please refer to https://arxiv.org/abs/1905.02188 for more details.
Args:
channels (int): input feature channels
scale_factor (int): upsample ratio
up_kernel (int): kernel size of CARAFE op
up_group (int): group size of CARAFE op
encoder_kernel (int): kernel size of content encoder
encoder_dilation (int): dilation of content encoder
compressed_channels (int): output channels of channels compressor
Returns:
upsampled feature map
"""
def __init__(self,
channels,
scale_factor,
up_kernel=5,
up_group=1,
encoder_kernel=3,
encoder_dilation=1,
compressed_channels=64):
super(CARAFEPack, self).__init__()
self.channels = channels
self.scale_factor = scale_factor
self.up_kernel = up_kernel
self.up_group = up_group
self.encoder_kernel = encoder_kernel
self.encoder_dilation = encoder_dilation
self.compressed_channels = compressed_channels
self.channel_compressor = nn.Conv2d(channels, self.compressed_channels,
1)
self.content_encoder = nn.Conv2d(
self.compressed_channels,
self.up_kernel * self.up_kernel * self.up_group *
self.scale_factor * self.scale_factor,
self.encoder_kernel,
padding=int((self.encoder_kernel - 1) * self.encoder_dilation / 2),
dilation=self.encoder_dilation,
groups=1)
self.init_weights()
def init_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
xavier_init(m, distribution='uniform')
normal_init(self.content_encoder, std=0.001)
def kernel_normalizer(self, mask):
mask = F.pixel_shuffle(mask, self.scale_factor)
n, mask_c, h, w = mask.size()
mask_channel = int(mask_c / (self.up_kernel * self.up_kernel))
mask = mask.view(n, mask_channel, -1, h, w)
mask = F.softmax(mask, dim=2)
mask = mask.view(n, mask_c, h, w).contiguous()
return mask
def feature_reassemble(self, x, mask):
x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor)
return x
def forward(self, x):
compressed_x = self.channel_compressor(x)
mask = self.content_encoder(compressed_x)
mask = self.kernel_normalizer(mask)
x = self.feature_reassemble(x, mask)
return x
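A minimal usage sketch for the modules above. CUDA tensors are assumed, since the CARAFE kernels are implemented as GPU extensions, and the shapes follow the assertions in `CARAFEFunction.forward`:
```python
import torch
from mmcv.ops import CARAFE, CARAFEPack

x = torch.randn(2, 64, 16, 16).cuda()

# CARAFEPack predicts its own reassembly masks from the input features.
up = CARAFEPack(channels=64, scale_factor=2).cuda()
out = up(x)  # (2, 64, 32, 32)

# The bare CARAFE module expects externally provided masks with
# kernel_size^2 * group_size channels at the upsampled resolution.
masks = torch.randn(2, 25, 32, 32).cuda().softmax(dim=1)
out2 = CARAFE(kernel_size=5, group_size=1, scale_factor=2)(x, masks)
```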
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd.function import once_differentiable
from mmcv.cnn import Scale
from ..utils import ext_loader
ext_module = ext_loader.load_ext(
'_ext', ['ca_forward', 'ca_backward', 'ca_map_forward', 'ca_map_backward'])
class CAWeightFunction(torch.autograd.Function):
@staticmethod
def symbolic(g, t, f):
return g.op('MMCVCAWeight', t, f)
@staticmethod
def forward(ctx, t, f):
n, c, h, w = t.size()
weight = torch.zeros(n, h + w - 1, h, w).to(t.device)
ext_module.ca_forward(t, f, weight)
ctx.save_for_backward(t, f)
return weight
@staticmethod
@once_differentiable
def backward(ctx, dw):
t, f = ctx.saved_tensors
dt = torch.zeros_like(t)
df = torch.zeros_like(f)
ext_module.ca_backward(dw, t, f, dt, df)
return dt, df
class CAMapFunction(torch.autograd.Function):
@staticmethod
def symbolic(g, weight, v):
return g.op('MMCVCAMap', weight, v)
@staticmethod
def forward(ctx, weight, v):
out = torch.zeros_like(v)
ext_module.ca_map_forward(weight, v, out)
ctx.save_for_backward(weight, v)
return out
@staticmethod
@once_differentiable
def backward(ctx, dout):
weight, v = ctx.saved_tensors
dw = torch.zeros_like(weight)
dv = torch.zeros_like(v)
ext_module.ca_map_backward(dout, weight, v, dw, dv)
return dw, dv
ca_weight = CAWeightFunction.apply
ca_map = CAMapFunction.apply
class CrissCrossAttention(nn.Module):
"""Criss-Cross Attention Module."""
def __init__(self, in_channels):
super(CrissCrossAttention, self).__init__()
self.query_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
self.key_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
self.value_conv = nn.Conv2d(in_channels, in_channels, 1)
self.gamma = Scale(0.)
self.in_channels = in_channels
def forward(self, x):
proj_query = self.query_conv(x)
proj_key = self.key_conv(x)
proj_value = self.value_conv(x)
energy = ca_weight(proj_query, proj_key)
attention = F.softmax(energy, 1)
out = ca_map(attention, proj_value)
out = self.gamma(out) + x
return out
def __repr__(self):
s = self.__class__.__name__
s += f'(in_channels={self.in_channels})'
return s
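A minimal usage sketch for the module above. CUDA tensors are assumed, since `ca_weight`/`ca_map` are GPU extension kernels:
```python
import torch
from mmcv.ops import CrissCrossAttention

cca = CrissCrossAttention(in_channels=64).cuda()
x = torch.randn(1, 64, 32, 32).cuda()
out = cca(x)  # same shape as the input: (1, 64, 32, 32)
```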
import torch
from torch import nn
from ..cnn import constant_init, kaiming_init
def last_zero_init(m):
if isinstance(m, nn.Sequential):
constant_init(m[-1], val=0)
else:
constant_init(m, val=0)
class ContextBlock(nn.Module):
"""ContextBlock module in GCNet.
See 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond'
(https://arxiv.org/abs/1904.11492) for details.
Args:
in_channels (int): Channels of the input feature map.
        ratio (float): Ratio of channels of transform bottleneck.
        pooling_type (str): Pooling method for context modeling.
            Options are 'att' and 'avg'.
        fusion_types (list[str]|tuple[str]): Fusion method for feature fusion.
            Options are 'channel_add' and 'channel_mul'.
"""
def __init__(self,
in_channels,
ratio,
pooling_type='att',
fusion_types=('channel_add', )):
super(ContextBlock, self).__init__()
assert pooling_type in ['avg', 'att']
assert isinstance(fusion_types, (list, tuple))
valid_fusion_types = ['channel_add', 'channel_mul']
assert all([f in valid_fusion_types for f in fusion_types])
assert len(fusion_types) > 0, 'at least one fusion should be used'
self.in_channels = in_channels
self.ratio = ratio
self.planes = int(in_channels * ratio)
self.pooling_type = pooling_type
self.fusion_types = fusion_types
if pooling_type == 'att':
self.conv_mask = nn.Conv2d(in_channels, 1, kernel_size=1)
self.softmax = nn.Softmax(dim=2)
else:
self.avg_pool = nn.AdaptiveAvgPool2d(1)
if 'channel_add' in fusion_types:
self.channel_add_conv = nn.Sequential(
nn.Conv2d(self.in_channels, self.planes, kernel_size=1),
nn.LayerNorm([self.planes, 1, 1]),
nn.ReLU(inplace=True), # yapf: disable
nn.Conv2d(self.planes, self.in_channels, kernel_size=1))
else:
self.channel_add_conv = None
if 'channel_mul' in fusion_types:
self.channel_mul_conv = nn.Sequential(
nn.Conv2d(self.in_channels, self.planes, kernel_size=1),
nn.LayerNorm([self.planes, 1, 1]),
nn.ReLU(inplace=True), # yapf: disable
nn.Conv2d(self.planes, self.in_channels, kernel_size=1))
else:
self.channel_mul_conv = None
self.reset_parameters()
def reset_parameters(self):
if self.pooling_type == 'att':
kaiming_init(self.conv_mask, mode='fan_in')
self.conv_mask.inited = True
if self.channel_add_conv is not None:
last_zero_init(self.channel_add_conv)
if self.channel_mul_conv is not None:
last_zero_init(self.channel_mul_conv)
def spatial_pool(self, x):
batch, channel, height, width = x.size()
if self.pooling_type == 'att':
input_x = x
# [N, C, H * W]
input_x = input_x.view(batch, channel, height * width)
# [N, 1, C, H * W]
input_x = input_x.unsqueeze(1)
# [N, 1, H, W]
context_mask = self.conv_mask(x)
# [N, 1, H * W]
context_mask = context_mask.view(batch, 1, height * width)
# [N, 1, H * W]
context_mask = self.softmax(context_mask)
# [N, 1, H * W, 1]
context_mask = context_mask.unsqueeze(-1)
# [N, 1, C, 1]
context = torch.matmul(input_x, context_mask)
# [N, C, 1, 1]
context = context.view(batch, channel, 1, 1)
else:
# [N, C, 1, 1]
context = self.avg_pool(x)
return context
def forward(self, x):
# [N, C, 1, 1]
context = self.spatial_pool(x)
out = x
if self.channel_mul_conv is not None:
# [N, C, 1, 1]
channel_mul_term = torch.sigmoid(self.channel_mul_conv(context))
out = out * channel_mul_term
if self.channel_add_conv is not None:
# [N, C, 1, 1]
channel_add_term = self.channel_add_conv(context)
out = out + channel_add_term
return out
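A minimal usage sketch for the module above; `ratio` controls the width of the transform bottleneck relative to the input channels:
```python
import torch
from mmcv.ops import ContextBlock

gc_block = ContextBlock(in_channels=256, ratio=1. / 16)
x = torch.randn(2, 256, 20, 20)
out = gc_block(x)  # (2, 256, 20, 20), a residual-style refinement of x
```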
import torch.nn as nn
import torch.nn.functional as F
from ..cnn import CONV_LAYERS
def conv_ws_2d(input,
weight,
bias=None,
stride=1,
padding=0,
dilation=1,
groups=1,
eps=1e-5):
c_in = weight.size(0)
weight_flat = weight.view(c_in, -1)
mean = weight_flat.mean(dim=1, keepdim=True).view(c_in, 1, 1, 1)
std = weight_flat.std(dim=1, keepdim=True).view(c_in, 1, 1, 1)
weight = (weight - mean) / (std + eps)
return F.conv2d(input, weight, bias, stride, padding, dilation, groups)
@CONV_LAYERS.register_module('ConvWS')
class ConvWS2d(nn.Conv2d):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
eps=1e-5):
super(ConvWS2d, self).__init__(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias)
self.eps = eps
def forward(self, x):
return conv_ws_2d(x, self.weight, self.bias, self.stride, self.padding,
self.dilation, self.groups, self.eps)
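A minimal usage sketch for the weight-standardized convolution above, used as a drop-in replacement for `nn.Conv2d` (it is commonly paired with GroupNorm, though that is not required):
```python
import torch
from mmcv.ops import ConvWS2d

conv = ConvWS2d(3, 16, kernel_size=3, padding=1)
x = torch.randn(1, 3, 32, 32)
out = conv(x)  # (1, 16, 32, 32), computed with standardized weights
```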
from torch import nn
from torch.autograd import Function
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', [
'top_pool_forward', 'top_pool_backward', 'bottom_pool_forward',
'bottom_pool_backward', 'left_pool_forward', 'left_pool_backward',
'right_pool_forward', 'right_pool_backward'
])
class TopPoolFunction(Function):
@staticmethod
def forward(ctx, input):
output = ext_module.top_pool_forward(input)
ctx.save_for_backward(input)
return output
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
output = ext_module.top_pool_backward(input, grad_output)
return output
class BottomPoolFunction(Function):
@staticmethod
def forward(ctx, input):
output = ext_module.bottom_pool_forward(input)
ctx.save_for_backward(input)
return output
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
output = ext_module.bottom_pool_backward(input, grad_output)
return output
class LeftPoolFunction(Function):
@staticmethod
def forward(ctx, input):
output = ext_module.left_pool_forward(input)
ctx.save_for_backward(input)
return output
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
output = ext_module.left_pool_backward(input, grad_output)
return output
class RightPoolFunction(Function):
@staticmethod
def forward(ctx, input):
output = ext_module.right_pool_forward(input)
ctx.save_for_backward(input)
return output
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
output = ext_module.right_pool_backward(input, grad_output)
return output
class CornerPool(nn.Module):
"""Corner Pooling.
Corner Pooling is a new type of pooling layer that helps a
convolutional network better localize corners of bounding boxes.
Please refer to https://arxiv.org/abs/1808.01244 for more details.
Code is modified from https://github.com/princeton-vl/CornerNet-Lite.
Args:
        mode (str): Pooling orientation for the pooling layer
- 'bottom': Bottom Pooling
- 'left': Left Pooling
- 'right': Right Pooling
- 'top': Top Pooling
Returns:
Feature map after pooling.
"""
pool_functions = {
'bottom': BottomPoolFunction,
'left': LeftPoolFunction,
'right': RightPoolFunction,
'top': TopPoolFunction,
}
def __init__(self, mode):
super(CornerPool, self).__init__()
assert mode in self.pool_functions
self.corner_pool = self.pool_functions[mode]
def forward(self, x):
return self.corner_pool.apply(x)
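A minimal usage sketch for the module above. CUDA tensors are assumed here, since the pooling forward/backward functions come from the compiled extension:
```python
import torch
from mmcv.ops import CornerPool

x = torch.randn(2, 128, 32, 32).cuda()

# Combine directions as in CornerNet-style corner heatmap branches.
top_left = CornerPool('top')(x) + CornerPool('left')(x)
bottom_right = CornerPool('bottom')(x) + CornerPool('right')(x)
```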
#ifndef BBOX_OVERLAPS_CUDA_KERNEL_CUH
#define BBOX_OVERLAPS_CUDA_KERNEL_CUH
template <typename T>
__global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2,
T* ious, const int num_bbox1,
const int num_bbox2, const int mode,
const bool aligned,
const int offset) {
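  // aligned == true : compute IoU only between bbox1[i] and bbox2[i]
  //                   (the two sets must have the same length);
  // aligned == false: compute the full num_bbox1 x num_bbox2 IoU matrix.
  // mode 0 selects IoU, mode 1 selects IoF (intersection over foreground).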
if (aligned) {
CUDA_1D_KERNEL_LOOP(index, num_bbox1) {
int b1 = index;
int b2 = index;
int base1 = b1 * 4;
T b1_x1 = bbox1[base1];
T b1_y1 = bbox1[base1 + 1];
T b1_x2 = bbox1[base1 + 2];
T b1_y2 = bbox1[base1 + 3];
T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
int base2 = b2 * 4;
T b2_x1 = bbox2[base2];
T b2_y1 = bbox2[base2 + 1];
T b2_x2 = bbox2[base2 + 2];
T b2_y2 = bbox2[base2 + 3];
T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
T width = fmaxf(right - left + offset, 0.f);
T height = fmaxf(bottom - top + offset, 0.f);
T interS = width * height;
T baseS = 1.0;
if (mode == 0) {
baseS = fmaxf(b1_area + b2_area - interS, T(offset));
} else if (mode == 1) {
baseS = fmaxf(b1_area, T(offset));
}
ious[index] = interS / baseS;
}
} else {
CUDA_1D_KERNEL_LOOP(index, num_bbox1 * num_bbox2) {
int b1 = index / num_bbox2;
int b2 = index % num_bbox2;
int base1 = b1 * 4;
T b1_x1 = bbox1[base1];
T b1_y1 = bbox1[base1 + 1];
T b1_x2 = bbox1[base1 + 2];
T b1_y2 = bbox1[base1 + 3];
T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
int base2 = b2 * 4;
T b2_x1 = bbox2[base2];
T b2_y1 = bbox2[base2 + 1];
T b2_x2 = bbox2[base2 + 2];
T b2_y2 = bbox2[base2 + 3];
T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
T width = fmaxf(right - left + offset, 0.f);
T height = fmaxf(bottom - top + offset, 0.f);
T interS = width * height;
T baseS = 1.0;
if (mode == 0) {
baseS = fmaxf(b1_area + b2_area - interS, T(offset));
} else if (mode == 1) {
baseS = fmaxf(b1_area, T(offset));
}
ious[index] = interS / baseS;
}
}
}
#endif
#define WARP_SIZE 32
#define THREADS_PER_PIXEL 32
#define MAX_SHARED_MEMORY 49152
#define MAX_SHARED_SCALAR_T 6144 // 49152 / 8 = 6144
#define MAXIMIZE_KERNEL_SIZE true
#define kTileDim 32
#define kBlockRows 8
#define FULL_MASK 0xffffffff
inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); }
__device__ inline int Loc2Index(const int n, const int c, const int h,
const int w, const int channel_num,
const int height, const int width) {
int index = w + (h + (c + n * channel_num) * height) * width;
return index;
}
/* TODO: move this to a common place */
template <typename scalar_t>
__device__ inline scalar_t min(scalar_t a, scalar_t b) {
return a < b ? a : b;
}
template <typename scalar_t>
__device__ inline scalar_t max(scalar_t a, scalar_t b) {
return a > b ? a : b;
}
template <typename scalar_t>
__device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) {
for (int offset = 16; offset > 0; offset /= 2)
val += __shfl_down_sync(FULL_MASK, val, offset);
return val;
}
template <>
__device__ __forceinline__ phalf warpReduceSum(phalf val) {
for (int offset = 16; offset > 0; offset /= 2)
__PHALF(val) +=
__shfl_down_sync(FULL_MASK, static_cast<__half>(__PHALF(val)), offset);
return val;
}
// Splits the original matrix into submatrices with size 32 * 32.
// Each block transposes one submatrix by loading it into shared memory.
// Reference https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/
template <typename scalar_t>
__global__ void BatchTranspose2DCUDAKernel(const int N, const int H,
const int W, const int dh,
const int dw,
const scalar_t *__restrict__ X,
scalar_t *__restrict__ Y) {
__shared__ scalar_t tile[kTileDim][kTileDim + 1];
const int n = blockIdx.x / (dh * dw);
const int k = blockIdx.x % (dh * dw);
const int r = k / dw;
const int c = k % dw;
const int offset = n * H * W;
int x = c * kTileDim + threadIdx.x;
int y = r * kTileDim + threadIdx.y;
if (x < W) {
for (int i = 0; threadIdx.y + i < kTileDim && y + i < H; i += kBlockRows) {
tile[threadIdx.y + i][threadIdx.x] = X[offset + (y + i) * W + x];
}
}
__syncthreads();
x = r * kTileDim + threadIdx.x;
y = c * kTileDim + threadIdx.y;
if (x < H) {
for (int i = 0; threadIdx.y + i < kTileDim && y + i < W; i += kBlockRows) {
Y[offset + (y + i) * H + x] = tile[threadIdx.x][threadIdx.y + i];
}
}
}
template <typename scalar_t>
__global__ void CARAFEForward(
const int num_kernels, const scalar_t *__restrict__ bottom_data,
const scalar_t *__restrict__ bottom_masks, const int kernel_size,
const int group_size, const int scale_factor, const int channels,
const int down_height, const int down_width, const int height,
const int width, const int mask_channels, scalar_t *__restrict__ top_data) {
#if MAXIMIZE_KERNEL_SIZE
__shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2];
#else
__shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T];
#endif
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index > num_kernels - 1) {
return;
}
const int pixel_id = threadIdx.x / THREADS_PER_PIXEL;
const int split_id = threadIdx.x % THREADS_PER_PIXEL;
index = index / THREADS_PER_PIXEL;
const int pw = index % width;
const int ph = (index / width) % height;
const int n = index / width / height;
const int down_pw = pw / scale_factor;
const int down_ph = ph / scale_factor;
const int start_w = down_pw - (kernel_size - 1) / 2;
const int end_w = down_pw + (kernel_size - 1) / 2 + 1;
const int start_h = down_ph - (kernel_size - 1) / 2;
const int end_h = down_ph + (kernel_size - 1) / 2 + 1;
for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) {
int mask_index = Loc2Index(n, ph, pw, c, height, width, mask_channels);
shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index];
}
__syncthreads();
const int channels_per_group = ceilf(channels / (float)group_size);
#pragma unroll
for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {
int mask_group = c / channels_per_group;
scalar_t output_val = 0;
#pragma unroll
for (int iy = start_h; iy < end_h; iy++) {
#pragma unroll
for (int ix = start_w; ix < end_w; ix++) {
if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) {
continue;
}
int mask_iy = iy - down_ph + (kernel_size - 1) / 2;
int mask_ix = ix - down_pw + (kernel_size - 1) / 2;
int mask_c =
(mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
int feat_index =
Loc2Index(n, iy, ix, c, down_height, down_width, channels);
output_val += bottom_data[feat_index] *
shared_mask[mask_c * WARP_SIZE + pixel_id];
}
}
int top_index = Loc2Index(n, ph, pw, c, height, width, channels);
top_data[top_index] = output_val;
}
}
template <typename scalar_t>
__global__ void CARAFEBackward_Feature(
const int num_kernels, const scalar_t *__restrict__ top_diff,
const scalar_t *__restrict__ bottom_masks, const int kernel_size,
const int group_size, const int scale_factor, const int channels,
const int down_height, const int down_width, const int height,
const int width, const int mask_channels,
scalar_t *__restrict__ bottom_diff) {
#if MAXIMIZE_KERNEL_SIZE
__shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2];
#else
__shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T];
#endif
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index > num_kernels - 1) {
return;
}
const int pixel_id = threadIdx.x / THREADS_PER_PIXEL;
const int split_id = threadIdx.x % THREADS_PER_PIXEL;
// (n, c, ph, pw) is an element in the bottom_data
index = index / THREADS_PER_PIXEL;
const int pw = index % width;
const int ph = (index / width) % height;
const int n = index / width / height;
const int start_w = pw - (kernel_size - 1) * scale_factor / 2;
const int end_w = pw + (kernel_size - 1) * scale_factor / 2 + 1;
const int start_h = ph - (kernel_size - 1) * scale_factor / 2;
const int end_h = ph + (kernel_size - 1) * scale_factor / 2 + 1;
for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) {
const int mask_w = (c % kernel_size) * scale_factor;
const int mask_h = (c / kernel_size % kernel_size) * scale_factor;
const int mask_x = start_w + mask_w;
const int mask_y = start_h + mask_h;
if (mask_y < 0 || mask_y > height - 1 || mask_x < 0 || mask_x > width - 1) {
shared_mask[c * WARP_SIZE + pixel_id] = 0;
continue;
}
const int mask_group = c / (kernel_size * kernel_size);
const int mask_c = (2 * mask_group + 1) * kernel_size * kernel_size - c - 1;
int mask_index =
Loc2Index(n, mask_c, mask_y, mask_x, mask_channels, height, width);
shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index];
}
__syncthreads();
const int channels_per_group = ceilf(channels / (float)group_size);
#pragma unroll
for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {
int mask_group = c / channels_per_group;
int top_index = Loc2Index(n, ph, pw, c, height, width, channels);
scalar_t output_val = 0;
#pragma unroll
for (int iy = start_h; iy < end_h; iy += scale_factor) {
#pragma unroll
for (int ix = start_w; ix < end_w; ix += scale_factor) {
if (iy < 0 || iy > height - 1 || ix < 0 || ix > width - 1) {
continue;
}
int mask_iy =
(iy - ph + (kernel_size - 1) * scale_factor / 2) / scale_factor;
int mask_ix =
(ix - pw + (kernel_size - 1) * scale_factor / 2) / scale_factor;
int mask_c =
(mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
int feat_index = Loc2Index(n, iy, ix, c, height, width, channels);
output_val +=
shared_mask[mask_c * WARP_SIZE + pixel_id] * top_diff[feat_index];
}
}
bottom_diff[top_index] = output_val;
}
}
template <typename scalar_t>
__global__ void FeatureSum(const int num_kernels,
const scalar_t *__restrict__ input_data,
const int scale_factor, const int channels,
const int height, const int width,
scalar_t *__restrict__ output_data) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index > num_kernels - 1) {
return;
}
const int split_id = threadIdx.x % THREADS_PER_PIXEL;
index = index / THREADS_PER_PIXEL;
const int pw = index % width;
const int ph = (index / width) % height;
const int n = index / width / height;
for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {
scalar_t output_val = 0;
for (int iy = ph * scale_factor; iy < (ph + 1) * scale_factor; iy++) {
for (int ix = pw * scale_factor; ix < (pw + 1) * scale_factor; ix++) {
int input_id = Loc2Index(n, iy, ix, c, height * scale_factor,
width * scale_factor, channels);
output_val += input_data[input_id];
}
}
const int output_id = Loc2Index(n, ph, pw, c, height, width, channels);
output_data[output_id] = output_val;
}
}
template <typename scalar_t>
__global__ void CARAFEBackward_Mask(const int num_kernels,
const scalar_t *__restrict__ top_diff,
const scalar_t *__restrict__ bottom_data,
const int kernel_size, const int group_size,
const int scale_factor, const int channels,
const int down_height, const int down_width,
const int height, const int width,
const int mask_channels,
scalar_t *__restrict__ mask_diff) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index > num_kernels - 1) {
return;
}
const int lane_id = index % WARP_SIZE;
index = index / WARP_SIZE;
const int mask_c = index % mask_channels;
// (n, c, ph, pw) is an element in the bottom_data
index = index / mask_channels;
const int pw = index % width;
const int ph = (index / width) % height;
const int n = index / width / height;
const int down_pw = pw / scale_factor;
const int down_ph = ph / scale_factor;
const int mask_group = mask_c / (kernel_size * kernel_size);
const int mask_loc = mask_c % (kernel_size * kernel_size);
const int offset_x = mask_loc % kernel_size - (kernel_size - 1) / 2;
const int offset_y =
mask_loc / kernel_size % kernel_size - (kernel_size - 1) / 2;
const int down_x = down_pw + offset_x;
const int down_y = down_ph + offset_y;
scalar_t output_val = 0;
if (down_y >= 0 && down_y <= down_height - 1 && down_x >= 0 &&
down_x <= down_width - 1) {
const int channels_per_mask = ceilf(channels / (float)group_size);
const int start = channels_per_mask * mask_group;
const int end = min(channels_per_mask * (mask_group + 1), channels);
for (int c = start + lane_id; c < end; c += WARP_SIZE) {
int bottom_id =
Loc2Index(n, down_y, down_x, c, down_height, down_width, channels);
int top_id = Loc2Index(n, ph, pw, c, height, width, channels);
output_val += top_diff[top_id] * bottom_data[bottom_id];
}
}
__syncwarp();
output_val = warpReduceSum(output_val);
if (lane_id == 0) {
const int mask_id =
Loc2Index(n, ph, pw, mask_c, height, width, mask_channels);
mask_diff[mask_id] = output_val;
}
}
#ifndef CARAFE_NAIVE_CUDA_KERNEL_CUH
#define CARAFE_NAIVE_CUDA_KERNEL_CUH
__device__ inline int Loc2Index(const int n, const int c, const int h,
const int w, const int channel_num,
const int height, const int width) {
int index = w + (h + (c + n * channel_num) * height) * width;
return index;
}
template <typename scalar_t>
__global__ void carafe_naive_forward_cuda_kernel(
const int nthreads, const scalar_t *bottom_data,
const scalar_t *bottom_masks, scalar_t *top_data, const int kernel_size,
const int group_size, const int scale_factor, const int channels,
const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the bottom_data
int pw = index % width;
int ph = (index / width) % height;
int c = (index / width / height) % channels;
int n = index / width / height / channels;
int mask_channels = kernel_size * kernel_size * group_size;
int mask_group = c / (channels / group_size);
int down_pw = pw / scale_factor;
int down_ph = ph / scale_factor;
int down_width = width / scale_factor;
int down_height = height / scale_factor;
int start_w = down_pw - (kernel_size - 1) / 2;
int end_w = down_pw + (kernel_size - 1) / 2 + 1;
int start_h = down_ph - (kernel_size - 1) / 2;
int end_h = down_ph + (kernel_size - 1) / 2 + 1;
scalar_t output_val = 0;
for (int iy = start_h; iy < end_h; iy++) {
for (int ix = start_w; ix < end_w; ix++) {
if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) {
continue;
}
int mask_iy = iy - down_ph + (kernel_size - 1) / 2;
int mask_ix = ix - down_pw + (kernel_size - 1) / 2;
int mask_c =
(mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
int feat_index =
Loc2Index(n, c, iy, ix, channels, down_height, down_width);
int mask_index =
Loc2Index(n, mask_c, ph, pw, mask_channels, height, width);
output_val += bottom_data[feat_index] * bottom_masks[mask_index];
}
}
top_data[index] = output_val;
}
}
template <typename scalar_t>
__global__ void carafe_naive_backward_cuda_kernel(
const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_data,
const scalar_t *bottom_masks, scalar_t *bottom_diff, scalar_t *mask_diff,
const int kernel_size, const int group_size, const int scale_factor,
const int channels, const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the bottom_data
int pw = index % width;
int ph = (index / width) % height;
int c = (index / width / height) % channels;
int n = index / width / height / channels;
int mask_channels = kernel_size * kernel_size * group_size;
int mask_group = c / (channels / group_size);
int down_pw = pw / scale_factor;
int down_ph = ph / scale_factor;
int down_width = width / scale_factor;
int down_height = height / scale_factor;
int start_w = down_pw - (kernel_size - 1) / 2;
int end_w = down_pw + (kernel_size - 1) / 2 + 1;
int start_h = down_ph - (kernel_size - 1) / 2;
int end_h = down_ph + (kernel_size - 1) / 2 + 1;
for (int iy = start_h; iy < end_h; iy++) {
for (int ix = start_w; ix < end_w; ix++) {
if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) {
continue;
}
int mask_iy = iy - down_ph + (kernel_size - 1) / 2;
int mask_ix = ix - down_pw + (kernel_size - 1) / 2;
int mask_c =
(mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
int feat_index =
Loc2Index(n, c, iy, ix, channels, down_height, down_width);
int mask_index =
Loc2Index(n, mask_c, ph, pw, mask_channels, height, width);
atomicAdd(bottom_diff + feat_index,
bottom_masks[mask_index] * top_diff[index]);
atomicAdd(mask_diff + mask_index,
bottom_data[feat_index] * top_diff[index]);
}
}
}
}
#endif