Commit 6f3c5f1c authored by limm's avatar limm
Browse files

support v1.4.0

parent 6f674c7e
...@@ -2,8 +2,6 @@ ...@@ -2,8 +2,6 @@
# modified from # modified from
# https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/border_align.py # https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/border_align.py
from typing import Tuple
import torch import torch
import torch.nn as nn import torch.nn as nn
from torch.autograd import Function from torch.autograd import Function
...@@ -23,8 +21,7 @@ class BorderAlignFunction(Function): ...@@ -23,8 +21,7 @@ class BorderAlignFunction(Function):
'mmcv::MMCVBorderAlign', input, boxes, pool_size_i=pool_size) 'mmcv::MMCVBorderAlign', input, boxes, pool_size_i=pool_size)
@staticmethod @staticmethod
def forward(ctx, input: torch.Tensor, boxes: torch.Tensor, def forward(ctx, input, boxes, pool_size):
pool_size: int) -> torch.Tensor:
ctx.pool_size = pool_size ctx.pool_size = pool_size
ctx.input_shape = input.size() ctx.input_shape = input.size()
...@@ -48,8 +45,7 @@ class BorderAlignFunction(Function): ...@@ -48,8 +45,7 @@ class BorderAlignFunction(Function):
@staticmethod @staticmethod
@once_differentiable @once_differentiable
def backward(ctx, def backward(ctx, grad_output):
grad_output: torch.Tensor) -> Tuple[torch.Tensor, None, None]:
boxes, argmax_idx = ctx.saved_tensors boxes, argmax_idx = ctx.saved_tensors
grad_input = grad_output.new_zeros(ctx.input_shape) grad_input = grad_output.new_zeros(ctx.input_shape)
# complex head architecture may cause grad_output uncontiguous # complex head architecture may cause grad_output uncontiguous
...@@ -76,25 +72,24 @@ class BorderAlign(nn.Module): ...@@ -76,25 +72,24 @@ class BorderAlign(nn.Module):
For each border line (e.g. top, left, bottom or right) of each box, For each border line (e.g. top, left, bottom or right) of each box,
border_align does the following: border_align does the following:
1. uniformly samples `pool_size`+1 positions on this line, involving \
1. uniformly samples ``pool_size`` +1 positions on this line, involving the start and end points.
the start and end points. 2. the corresponding features on these points are computed by \
2. the corresponding features on these points are computed by bilinear bilinear interpolation.
interpolation. 3. max pooling over all the `pool_size`+1 positions are used for \
3. max pooling over all the ``pool_size`` +1 positions are used for computing pooled feature.
computing pooled feature.
Args: Args:
pool_size (int): number of positions sampled over the boxes' borders pool_size (int): number of positions sampled over the boxes' borders
(e.g. top, bottom, left, right). (e.g. top, bottom, left, right).
""" """
def __init__(self, pool_size: int): def __init__(self, pool_size):
super().__init__() super(BorderAlign, self).__init__()
self.pool_size = pool_size self.pool_size = pool_size
def forward(self, input: torch.Tensor, def forward(self, input, boxes):
boxes: torch.Tensor) -> torch.Tensor:
""" """
Args: Args:
input: Features with shape [N,4C,H,W]. Channels ranged in [0,C), input: Features with shape [N,4C,H,W]. Channels ranged in [0,C),
...@@ -103,8 +98,8 @@ class BorderAlign(nn.Module): ...@@ -103,8 +98,8 @@ class BorderAlign(nn.Module):
boxes: Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2). boxes: Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2).
Returns: Returns:
torch.Tensor: Pooled features with shape [N,C,H*W,4]. The order is Tensor: Pooled features with shape [N,C,H*W,4]. The order is
(top,left,bottom,right) for the last dimension. (top,left,bottom,right) for the last dimension.
""" """
return border_align(input, boxes, self.pool_size) return border_align(input, boxes, self.pool_size)
......
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['box_iou_quadri'])
def box_iou_quadri(bboxes1: torch.Tensor,
                   bboxes2: torch.Tensor,
                   mode: str = 'iou',
                   aligned: bool = False) -> torch.Tensor:
    """Return intersection-over-union (Jaccard index) of boxes.

    Both sets of boxes are expected to be in
    (x1, y1, ..., x4, y4) format.

    If ``aligned`` is ``False``, then calculate the ious between each bbox
    of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
    bboxes1 and bboxes2.

    Args:
        bboxes1 (torch.Tensor): quadrilateral bboxes 1. It has shape (N, 8),
            indicating (x1, y1, ..., x4, y4) for each row.
        bboxes2 (torch.Tensor): quadrilateral bboxes 2. It has shape (M, 8),
            indicating (x1, y1, ..., x4, y4) for each row.
        mode (str): "iou" (intersection over union) or "iof" (intersection
            over foreground).
        aligned (bool): If ``True``, compute the iou of each aligned pair of
            bboxes1 and bboxes2 (requires N == M); otherwise compute the full
            N x M iou matrix. Defaults to ``False``.

    Returns:
        torch.Tensor: Return the ious between boxes. If ``aligned`` is
        ``False``, the shape of ious is (N, M) else (N,).
    """
    assert mode in ['iou', 'iof']
    # Map the mode string to the integer flag expected by the C++/CUDA op.
    mode_dict = {'iou': 0, 'iof': 1}
    mode_flag = mode_dict[mode]
    rows = bboxes1.size(0)
    cols = bboxes2.size(0)
    # Pre-allocate the flat output buffer that the extension fills in place.
    if aligned:
        ious = bboxes1.new_zeros(rows)
    else:
        ious = bboxes1.new_zeros(rows * cols)
    # The extension op requires contiguous memory.
    bboxes1 = bboxes1.contiguous()
    bboxes2 = bboxes2.contiguous()
    ext_module.box_iou_quadri(
        bboxes1, bboxes2, ious, mode_flag=mode_flag, aligned=aligned)
    if not aligned:
        ious = ious.view(rows, cols)
    return ious
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
import torch
from ..utils import ext_loader from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['box_iou_rotated']) ext_module = ext_loader.load_ext('_ext', ['box_iou_rotated'])
def box_iou_rotated(bboxes1: torch.Tensor, def box_iou_rotated(bboxes1, bboxes2, mode='iou', aligned=False):
bboxes2: torch.Tensor,
mode: str = 'iou',
aligned: bool = False,
clockwise: bool = True) -> torch.Tensor:
"""Return intersection-over-union (Jaccard index) of boxes. """Return intersection-over-union (Jaccard index) of boxes.
Both sets of boxes are expected to be in Both sets of boxes are expected to be in
...@@ -20,110 +14,18 @@ def box_iou_rotated(bboxes1: torch.Tensor, ...@@ -20,110 +14,18 @@ def box_iou_rotated(bboxes1: torch.Tensor,
of bboxes1 and bboxes2, otherwise the ious between each aligned pair of of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
bboxes1 and bboxes2. bboxes1 and bboxes2.
.. note:: Arguments:
The operator assumes: boxes1 (Tensor): rotated bboxes 1. \
It has shape (N, 5), indicating (x, y, w, h, theta) for each row.
1) The positive direction along x axis is left -> right. Note that theta is in radian.
boxes2 (Tensor): rotated bboxes 2. \
2) The positive direction along y axis is top -> down. It has shape (M, 5), indicating (x, y, w, h, theta) for each row.
Note that theta is in radian.
3) The w border is in parallel with x axis when angle = 0.
However, there are 2 opposite definitions of the positive angular
direction, clockwise (CW) and counter-clockwise (CCW). MMCV supports
both definitions and uses CW by default.
Please set ``clockwise=False`` if you are using the CCW definition.
The coordinate system when ``clockwise`` is ``True`` (default)
.. code-block:: none
0-------------------> x (0 rad)
| A-------------B
| | |
| | box h
| | angle=0 |
| D------w------C
v
y (pi/2 rad)
In such coordination system the rotation matrix is
.. math::
\\begin{pmatrix}
\\cos\\alpha & -\\sin\\alpha \\\\
\\sin\\alpha & \\cos\\alpha
\\end{pmatrix}
The coordinates of the corner point A can be calculated as:
.. math::
P_A=
\\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix}
=
\\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} +
\\begin{pmatrix}\\cos\\alpha & -\\sin\\alpha \\\\
\\sin\\alpha & \\cos\\alpha\\end{pmatrix}
\\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\
=
\\begin{pmatrix} x_{center}-0.5w\\cos\\alpha+0.5h\\sin\\alpha
\\\\
y_{center}-0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix}
The coordinate system when ``clockwise`` is ``False``
.. code-block:: none
0-------------------> x (0 rad)
| A-------------B
| | |
| | box h
| | angle=0 |
| D------w------C
v
y (-pi/2 rad)
In such coordination system the rotation matrix is
.. math::
\\begin{pmatrix}
\\cos\\alpha & \\sin\\alpha \\\\
-\\sin\\alpha & \\cos\\alpha
\\end{pmatrix}
The coordinates of the corner point A can be calculated as:
.. math::
P_A=
\\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix}
=
\\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} +
\\begin{pmatrix}\\cos\\alpha & \\sin\\alpha \\\\
-\\sin\\alpha & \\cos\\alpha\\end{pmatrix}
\\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\
=
\\begin{pmatrix} x_{center}-0.5w\\cos\\alpha-0.5h\\sin\\alpha
\\\\
y_{center}+0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix}
Args:
boxes1 (torch.Tensor): rotated bboxes 1. It has shape (N, 5),
indicating (x, y, w, h, theta) for each row. Note that theta is in
radian.
boxes2 (torch.Tensor): rotated bboxes 2. It has shape (M, 5),
indicating (x, y, w, h, theta) for each row. Note that theta is in
radian.
mode (str): "iou" (intersection over union) or iof (intersection over mode (str): "iou" (intersection over union) or iof (intersection over
foreground). foreground).
clockwise (bool): flag indicating whether the positive angular
orientation is clockwise. default True.
`New in version 1.4.3.`
Returns: Returns:
torch.Tensor: Return the ious betweens boxes. If ``aligned`` is ious(Tensor): shape (N, M) if aligned == False else shape (N,)
``False``, the shape of ious is (N, M) else (N,).
""" """
assert mode in ['iou', 'iof'] assert mode in ['iou', 'iof']
mode_dict = {'iou': 0, 'iof': 1} mode_dict = {'iou': 0, 'iof': 1}
...@@ -133,12 +35,7 @@ def box_iou_rotated(bboxes1: torch.Tensor, ...@@ -133,12 +35,7 @@ def box_iou_rotated(bboxes1: torch.Tensor,
if aligned: if aligned:
ious = bboxes1.new_zeros(rows) ious = bboxes1.new_zeros(rows)
else: else:
ious = bboxes1.new_zeros(rows * cols) ious = bboxes1.new_zeros((rows * cols))
if not clockwise:
flip_mat = bboxes1.new_ones(bboxes1.shape[-1])
flip_mat[-1] = -1
bboxes1 = bboxes1 * flip_mat
bboxes2 = bboxes2 * flip_mat
bboxes1 = bboxes1.contiguous() bboxes1 = bboxes1.contiguous()
bboxes2 = bboxes2.contiguous() bboxes2 = bboxes2.contiguous()
ext_module.box_iou_rotated( ext_module.box_iou_rotated(
......
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
from mmengine.model import normal_init, xavier_init
from mmengine.registry import MODELS
from torch import Tensor
from torch.autograd import Function from torch.autograd import Function
from torch.nn.modules.module import Module from torch.nn.modules.module import Module
from ..cnn import UPSAMPLE_LAYERS, normal_init, xavier_init
from ..utils import ext_loader from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', [ ext_module = ext_loader.load_ext('_ext', [
...@@ -21,8 +17,7 @@ ext_module = ext_loader.load_ext('_ext', [ ...@@ -21,8 +17,7 @@ ext_module = ext_loader.load_ext('_ext', [
class CARAFENaiveFunction(Function): class CARAFENaiveFunction(Function):
@staticmethod @staticmethod
def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int, def symbolic(g, features, masks, kernel_size, group_size, scale_factor):
group_size: int, scale_factor: int) -> Tensor:
return g.op( return g.op(
'mmcv::MMCVCARAFENaive', 'mmcv::MMCVCARAFENaive',
features, features,
...@@ -32,8 +27,7 @@ class CARAFENaiveFunction(Function): ...@@ -32,8 +27,7 @@ class CARAFENaiveFunction(Function):
scale_factor_f=scale_factor) scale_factor_f=scale_factor)
@staticmethod @staticmethod
def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int, def forward(ctx, features, masks, kernel_size, group_size, scale_factor):
group_size: int, scale_factor: int) -> Tensor:
assert scale_factor >= 1 assert scale_factor >= 1
assert masks.size(1) == kernel_size * kernel_size * group_size assert masks.size(1) == kernel_size * kernel_size * group_size
assert masks.size(-1) == features.size(-1) * scale_factor assert masks.size(-1) == features.size(-1) * scale_factor
...@@ -56,15 +50,12 @@ class CARAFENaiveFunction(Function): ...@@ -56,15 +50,12 @@ class CARAFENaiveFunction(Function):
group_size=group_size, group_size=group_size,
scale_factor=scale_factor) scale_factor=scale_factor)
if features.requires_grad or masks.requires_grad or \ if features.requires_grad or masks.requires_grad:
torch.__version__ == 'parrots':
ctx.save_for_backward(features, masks) ctx.save_for_backward(features, masks)
return output return output
@staticmethod @staticmethod
def backward( def backward(ctx, grad_output):
ctx,
grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]:
assert grad_output.is_cuda assert grad_output.is_cuda
features, masks = ctx.saved_tensors features, masks = ctx.saved_tensors
...@@ -92,8 +83,8 @@ carafe_naive = CARAFENaiveFunction.apply ...@@ -92,8 +83,8 @@ carafe_naive = CARAFENaiveFunction.apply
class CARAFENaive(Module): class CARAFENaive(Module):
def __init__(self, kernel_size: int, group_size: int, scale_factor: int): def __init__(self, kernel_size, group_size, scale_factor):
super().__init__() super(CARAFENaive, self).__init__()
assert isinstance(kernel_size, int) and isinstance( assert isinstance(kernel_size, int) and isinstance(
group_size, int) and isinstance(scale_factor, int) group_size, int) and isinstance(scale_factor, int)
...@@ -101,7 +92,7 @@ class CARAFENaive(Module): ...@@ -101,7 +92,7 @@ class CARAFENaive(Module):
self.group_size = group_size self.group_size = group_size
self.scale_factor = scale_factor self.scale_factor = scale_factor
def forward(self, features: Tensor, masks: Tensor) -> Tensor: def forward(self, features, masks):
return carafe_naive(features, masks, self.kernel_size, self.group_size, return carafe_naive(features, masks, self.kernel_size, self.group_size,
self.scale_factor) self.scale_factor)
...@@ -109,8 +100,7 @@ class CARAFENaive(Module): ...@@ -109,8 +100,7 @@ class CARAFENaive(Module):
class CARAFEFunction(Function): class CARAFEFunction(Function):
@staticmethod @staticmethod
def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int, def symbolic(g, features, masks, kernel_size, group_size, scale_factor):
group_size: int, scale_factor: int) -> Tensor:
return g.op( return g.op(
'mmcv::MMCVCARAFE', 'mmcv::MMCVCARAFE',
features, features,
...@@ -120,8 +110,7 @@ class CARAFEFunction(Function): ...@@ -120,8 +110,7 @@ class CARAFEFunction(Function):
scale_factor_f=scale_factor) scale_factor_f=scale_factor)
@staticmethod @staticmethod
def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int, def forward(ctx, features, masks, kernel_size, group_size, scale_factor):
group_size: int, scale_factor: int) -> Tensor:
assert scale_factor >= 1 assert scale_factor >= 1
assert masks.size(1) == kernel_size * kernel_size * group_size assert masks.size(1) == kernel_size * kernel_size * group_size
assert masks.size(-1) == features.size(-1) * scale_factor assert masks.size(-1) == features.size(-1) * scale_factor
...@@ -150,15 +139,14 @@ class CARAFEFunction(Function): ...@@ -150,15 +139,14 @@ class CARAFEFunction(Function):
group_size=group_size, group_size=group_size,
scale_factor=scale_factor) scale_factor=scale_factor)
if features.requires_grad or masks.requires_grad or \ if features.requires_grad or masks.requires_grad:
torch.__version__ == 'parrots':
ctx.save_for_backward(features, masks, rfeatures) ctx.save_for_backward(features, masks, rfeatures)
return output return output
@staticmethod @staticmethod
def backward( def backward(ctx, grad_output):
ctx, assert grad_output.is_cuda
grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]:
features, masks, rfeatures = ctx.saved_tensors features, masks, rfeatures = ctx.saved_tensors
kernel_size = ctx.kernel_size kernel_size = ctx.kernel_size
group_size = ctx.group_size group_size = ctx.group_size
...@@ -192,8 +180,7 @@ carafe = CARAFEFunction.apply ...@@ -192,8 +180,7 @@ carafe = CARAFEFunction.apply
class CARAFE(Module): class CARAFE(Module):
""" CARAFE: Content-Aware ReAssembly of FEatures """ CARAFE: Content-Aware ReAssembly of FEatures
Please refer to `CARAFE: Content-Aware ReAssembly of FEatures Please refer to https://arxiv.org/abs/1905.02188 for more details.
<https://arxiv.org/abs/1905.02188>`_ for more details.
Args: Args:
kernel_size (int): reassemble kernel size kernel_size (int): reassemble kernel size
...@@ -204,8 +191,8 @@ class CARAFE(Module): ...@@ -204,8 +191,8 @@ class CARAFE(Module):
upsampled feature map upsampled feature map
""" """
def __init__(self, kernel_size: int, group_size: int, scale_factor: int): def __init__(self, kernel_size, group_size, scale_factor):
super().__init__() super(CARAFE, self).__init__()
assert isinstance(kernel_size, int) and isinstance( assert isinstance(kernel_size, int) and isinstance(
group_size, int) and isinstance(scale_factor, int) group_size, int) and isinstance(scale_factor, int)
...@@ -213,19 +200,19 @@ class CARAFE(Module): ...@@ -213,19 +200,19 @@ class CARAFE(Module):
self.group_size = group_size self.group_size = group_size
self.scale_factor = scale_factor self.scale_factor = scale_factor
def forward(self, features: Tensor, masks: Tensor) -> Tensor: def forward(self, features, masks):
return carafe(features, masks, self.kernel_size, self.group_size, return carafe(features, masks, self.kernel_size, self.group_size,
self.scale_factor) self.scale_factor)
@MODELS.register_module(name='carafe') @UPSAMPLE_LAYERS.register_module(name='carafe')
class CARAFEPack(nn.Module): class CARAFEPack(nn.Module):
"""A unified package of CARAFE upsampler that contains: 1) channel """A unified package of CARAFE upsampler that contains: 1) channel
compressor 2) content encoder 3) CARAFE op. compressor 2) content encoder 3) CARAFE op.
Official implementation of ICCV 2019 paper Official implementation of ICCV 2019 paper
`CARAFE: Content-Aware ReAssembly of FEatures CARAFE: Content-Aware ReAssembly of FEatures
<https://arxiv.org/abs/1905.02188>`_. Please refer to https://arxiv.org/abs/1905.02188 for more details.
Args: Args:
channels (int): input feature channels channels (int): input feature channels
...@@ -241,14 +228,14 @@ class CARAFEPack(nn.Module): ...@@ -241,14 +228,14 @@ class CARAFEPack(nn.Module):
""" """
def __init__(self, def __init__(self,
channels: int, channels,
scale_factor: int, scale_factor,
up_kernel: int = 5, up_kernel=5,
up_group: int = 1, up_group=1,
encoder_kernel: int = 3, encoder_kernel=3,
encoder_dilation: int = 1, encoder_dilation=1,
compressed_channels: int = 64): compressed_channels=64):
super().__init__() super(CARAFEPack, self).__init__()
self.channels = channels self.channels = channels
self.scale_factor = scale_factor self.scale_factor = scale_factor
self.up_kernel = up_kernel self.up_kernel = up_kernel
...@@ -274,7 +261,7 @@ class CARAFEPack(nn.Module): ...@@ -274,7 +261,7 @@ class CARAFEPack(nn.Module):
xavier_init(m, distribution='uniform') xavier_init(m, distribution='uniform')
normal_init(self.content_encoder, std=0.001) normal_init(self.content_encoder, std=0.001)
def kernel_normalizer(self, mask: Tensor) -> Tensor: def kernel_normalizer(self, mask):
mask = F.pixel_shuffle(mask, self.scale_factor) mask = F.pixel_shuffle(mask, self.scale_factor)
n, mask_c, h, w = mask.size() n, mask_c, h, w = mask.size()
# use float division explicitly, # use float division explicitly,
...@@ -287,11 +274,11 @@ class CARAFEPack(nn.Module): ...@@ -287,11 +274,11 @@ class CARAFEPack(nn.Module):
return mask return mask
def feature_reassemble(self, x: Tensor, mask: Tensor) -> Tensor: def feature_reassemble(self, x, mask):
x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor) x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor)
return x return x
def forward(self, x: Tensor) -> Tensor: def forward(self, x):
compressed_x = self.channel_compressor(x) compressed_x = self.channel_compressor(x)
mask = self.content_encoder(compressed_x) mask = self.content_encoder(compressed_x)
mask = self.kernel_normalizer(mask) mask = self.kernel_normalizer(mask)
......
...@@ -2,12 +2,11 @@ ...@@ -2,12 +2,11 @@
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
from mmengine.registry import MODELS
from mmcv.cnn import Scale from mmcv.cnn import PLUGIN_LAYERS, Scale
def NEG_INF_DIAG(n: int, device: torch.device) -> torch.Tensor: def NEG_INF_DIAG(n, device):
"""Returns a diagonal matrix of size [n, n]. """Returns a diagonal matrix of size [n, n].
The diagonal are all "-inf". This is for avoiding calculating the The diagonal are all "-inf". This is for avoiding calculating the
...@@ -16,7 +15,7 @@ def NEG_INF_DIAG(n: int, device: torch.device) -> torch.Tensor: ...@@ -16,7 +15,7 @@ def NEG_INF_DIAG(n: int, device: torch.device) -> torch.Tensor:
return torch.diag(torch.tensor(float('-inf')).to(device).repeat(n), 0) return torch.diag(torch.tensor(float('-inf')).to(device).repeat(n), 0)
@MODELS.register_module() @PLUGIN_LAYERS.register_module()
class CrissCrossAttention(nn.Module): class CrissCrossAttention(nn.Module):
"""Criss-Cross Attention Module. """Criss-Cross Attention Module.
...@@ -42,7 +41,7 @@ class CrissCrossAttention(nn.Module): ...@@ -42,7 +41,7 @@ class CrissCrossAttention(nn.Module):
in_channels (int): Channels of the input feature map. in_channels (int): Channels of the input feature map.
""" """
def __init__(self, in_channels: int) -> None: def __init__(self, in_channels):
super().__init__() super().__init__()
self.query_conv = nn.Conv2d(in_channels, in_channels // 8, 1) self.query_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
self.key_conv = nn.Conv2d(in_channels, in_channels // 8, 1) self.key_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
...@@ -50,15 +49,14 @@ class CrissCrossAttention(nn.Module): ...@@ -50,15 +49,14 @@ class CrissCrossAttention(nn.Module):
self.gamma = Scale(0.) self.gamma = Scale(0.)
self.in_channels = in_channels self.in_channels = in_channels
def forward(self, x: torch.Tensor) -> torch.Tensor: def forward(self, x):
"""forward function of Criss-Cross Attention. """forward function of Criss-Cross Attention.
Args: Args:
x (torch.Tensor): Input feature with the shape of x (Tensor): Input feature. \
(batch_size, in_channels, height, width). shape (batch_size, in_channels, height, width)
Returns: Returns:
torch.Tensor: Output of the layer, with the shape of Tensor: Output of the layer, with shape of \
(batch_size, in_channels, height, width) (batch_size, in_channels, height, width)
""" """
B, C, H, W = x.size() B, C, H, W = x.size()
...@@ -79,7 +77,7 @@ class CrissCrossAttention(nn.Module): ...@@ -79,7 +77,7 @@ class CrissCrossAttention(nn.Module):
return out return out
def __repr__(self) -> str: def __repr__(self):
s = self.__class__.__name__ s = self.__class__.__name__
s += f'(in_channels={self.in_channels})' s += f'(in_channels={self.in_channels})'
return s return s
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Sequence, Tuple
import torch
from torch import Tensor
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from ..utils import ext_loader
ext_module = ext_loader.load_ext(
'_ext', ['chamfer_distance_forward', 'chamfer_distance_backward'])
class ChamferDistanceFunction(Function):
    """This is an implementation of the 2D Chamfer Distance.

    It has been used in the paper `Oriented RepPoints for Aerial Object
    Detection (CVPR 2022) <https://arxiv.org/abs/2105.11111>`_.
    """

    @staticmethod
    def forward(ctx, xyz1: Tensor, xyz2: Tensor) -> Sequence[Tensor]:
        """
        Args:
            xyz1 (Tensor): Point set with shape (B, N, 2).
            xyz2 (Tensor): Point set with shape (B, M, 2).

        Returns:
            Sequence[Tensor]:

                - dist1 (Tensor): Chamfer distance (xyz1 to xyz2) with
                    shape (B, N).
                - dist2 (Tensor): Chamfer distance (xyz2 to xyz1) with
                    shape (B, M).
                - idx1 (Tensor): Index of chamfer distance (xyz1 to xyz2)
                    with shape (B, N), which is used to compute the gradient.
                - idx2 (Tensor): Index of chamfer distance (xyz2 to xyz1)
                    with shape (B, M), which is used to compute the gradient.
        """
        batch_size, n, _ = xyz1.size()
        _, m, _ = xyz2.size()
        device = xyz1.device
        # The extension op requires contiguous memory.
        xyz1 = xyz1.contiguous()
        xyz2 = xyz2.contiguous()

        # Output buffers filled in place by the extension op.
        dist1 = torch.zeros(batch_size, n).to(device)
        dist2 = torch.zeros(batch_size, m).to(device)
        idx1 = torch.zeros(batch_size, n).type(torch.IntTensor).to(device)
        idx2 = torch.zeros(batch_size, m).type(torch.IntTensor).to(device)

        ext_module.chamfer_distance_forward(xyz1, xyz2, dist1, dist2, idx1,
                                            idx2)
        # Indices of nearest neighbours are needed again in backward().
        ctx.save_for_backward(xyz1, xyz2, idx1, idx2)
        return dist1, dist2, idx1, idx2

    @staticmethod
    @once_differentiable
    def backward(ctx,
                 grad_dist1: Tensor,
                 grad_dist2: Tensor,
                 grad_idx1=None,
                 grad_idx2=None) -> Tuple[Tensor, Tensor]:
        """
        Args:
            grad_dist1 (Tensor): Gradient of chamfer distance
                (xyz1 to xyz2) with shape (B, N).
            grad_dist2 (Tensor): Gradient of chamfer distance
                (xyz2 to xyz1) with shape (B, M).

        Returns:
            Tuple[Tensor, Tensor]:

                - grad_xyz1 (Tensor): Gradient of the point set with shape \
                    (B, N, 2).
                - grad_xyz2 (Tensor): Gradient of the point set with shape \
                    (B, M, 2).
        """
        xyz1, xyz2, idx1, idx2 = ctx.saved_tensors
        device = grad_dist1.device
        # The extension op requires contiguous memory.
        grad_dist1 = grad_dist1.contiguous()
        grad_dist2 = grad_dist2.contiguous()
        # Gradients accumulated in place by the extension op.
        grad_xyz1 = torch.zeros(xyz1.size()).to(device)
        grad_xyz2 = torch.zeros(xyz2.size()).to(device)

        ext_module.chamfer_distance_backward(xyz1, xyz2, idx1, idx2,
                                             grad_dist1, grad_dist2, grad_xyz1,
                                             grad_xyz2)
        return grad_xyz1, grad_xyz2
chamfer_distance = ChamferDistanceFunction.apply
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
from typing import Union
import numpy as np import numpy as np
import torch import torch
...@@ -9,22 +7,21 @@ from ..utils import ext_loader ...@@ -9,22 +7,21 @@ from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['contour_expand']) ext_module = ext_loader.load_ext('_ext', ['contour_expand'])
def contour_expand(kernel_mask: Union[np.array, torch.Tensor], def contour_expand(kernel_mask, internal_kernel_label, min_kernel_area,
internal_kernel_label: Union[np.array, torch.Tensor], kernel_num):
min_kernel_area: int, kernel_num: int) -> list:
"""Expand kernel contours so that foreground pixels are assigned into """Expand kernel contours so that foreground pixels are assigned into
instances. instances.
Args: Arguments:
kernel_mask (np.array or torch.Tensor): The instance kernel mask with kernel_mask (np.array or Tensor): The instance kernel mask with
size hxw. size hxw.
internal_kernel_label (np.array or torch.Tensor): The instance internal internal_kernel_label (np.array or Tensor): The instance internal
kernel label with size hxw. kernel label with size hxw.
min_kernel_area (int): The minimum kernel area. min_kernel_area (int): The minimum kernel area.
kernel_num (int): The instance kernel number. kernel_num (int): The instance kernel number.
Returns: Returns:
list: The instance index map with size hxw. label (list): The instance index map with size hxw.
""" """
assert isinstance(kernel_mask, (torch.Tensor, np.ndarray)) assert isinstance(kernel_mask, (torch.Tensor, np.ndarray))
assert isinstance(internal_kernel_label, (torch.Tensor, np.ndarray)) assert isinstance(internal_kernel_label, (torch.Tensor, np.ndarray))
...@@ -45,7 +42,7 @@ def contour_expand(kernel_mask: Union[np.array, torch.Tensor], ...@@ -45,7 +42,7 @@ def contour_expand(kernel_mask: Union[np.array, torch.Tensor],
internal_kernel_label, internal_kernel_label,
min_kernel_area=min_kernel_area, min_kernel_area=min_kernel_area,
kernel_num=kernel_num) kernel_num=kernel_num)
label = label.tolist() # type: ignore label = label.tolist()
else: else:
label = ext_module.contour_expand(kernel_mask, internal_kernel_label, label = ext_module.contour_expand(kernel_mask, internal_kernel_label,
min_kernel_area, kernel_num) min_kernel_area, kernel_num)
......
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
# source: https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/conv2d_gradfix.py # noqa
"""Custom replacement for `torch.nn.functional.conv2d` that supports
arbitrarily high order gradients with zero performance penalty."""
import contextlib
import warnings
from typing import Dict, Optional, Tuple, Union
import torch
from mmengine.utils import digit_version
# Module-level switches for the custom gradfix op.
# `enabled`: master on/off switch checked by _should_use_custom_op().
# `weight_gradients_disabled`: toggled by the no_weight_gradients() context
# manager to skip weight-gradient computation in the backward pass.
enabled = True
weight_gradients_disabled = False
@contextlib.contextmanager
def no_weight_gradients(disable=True):
    """Context manager that temporarily disables weight gradients.

    While active (and ``disable`` is True), the module-level flag
    ``weight_gradients_disabled`` is set so that the custom conv2d backward
    skips computing weight gradients.

    Args:
        disable (bool): Whether to actually disable weight gradients inside
            the ``with`` block. If False, this is a no-op. Defaults to True.
    """
    global weight_gradients_disabled
    old = weight_gradients_disabled
    if disable:
        weight_gradients_disabled = True
    # Restore in a `finally` so an exception raised inside the `with` body
    # cannot leak the disabled state (the original code skipped the restore
    # on exception).
    try:
        yield
    finally:
        weight_gradients_disabled = old
def conv2d(input: torch.Tensor,
           weight: torch.Tensor,
           bias: Optional[torch.Tensor] = None,
           stride: Union[int, Tuple[int, ...]] = 1,
           padding: Union[int, Tuple[int, ...]] = 0,
           dilation: Union[int, Tuple[int, ...]] = 1,
           groups: int = 1):
    """Replacement for ``torch.nn.functional.conv2d``.

    Dispatches to the custom gradfix autograd op when usable; otherwise
    falls back to the stock ``torch.nn.functional.conv2d``. On torch>=1.10
    the custom op is unconditionally disabled (with a warning), since the
    backend function it relies on is not supported there.
    """
    use_gradfix = True
    if digit_version(torch.__version__) >= digit_version('1.10.0'):
        warnings.warn('Since '
                      'aten:cudnn_convolution_backward_weight is '
                      f'not supported in torch=={torch.__version__},'
                      ' rolling back to `torch.nn.functional.conv2d`')
        use_gradfix = False
    if _should_use_custom_op(input) and use_gradfix:
        op = _conv2d_gradfix(
            transpose=False,
            weight_shape=weight.shape,
            stride=stride,
            padding=padding,
            output_padding=0,
            dilation=dilation,
            groups=groups)
        return op.apply(input, weight, bias)
    return torch.nn.functional.conv2d(
        input=input,
        weight=weight,
        bias=bias,
        stride=stride,
        padding=padding,
        dilation=dilation,
        groups=groups)
def conv_transpose2d(input: torch.Tensor,
                     weight: torch.Tensor,
                     bias: Optional[torch.Tensor] = None,
                     stride: Union[int, Tuple[int, ...]] = 1,
                     padding: Union[int, Tuple[int, ...]] = 0,
                     output_padding: Union[int, Tuple[int, ...]] = 0,
                     groups: int = 1,
                     dilation: Union[int, Tuple[int, ...]] = 1):
    """Replacement for ``torch.nn.functional.conv_transpose2d``.

    Uses the custom gradfix autograd op when applicable, falling back to
    the stock ``torch.nn.functional.conv_transpose2d`` otherwise.
    """
    # Guard clause: plain fallback when the custom op cannot be used.
    if not _should_use_custom_op(input):
        return torch.nn.functional.conv_transpose2d(
            input=input,
            weight=weight,
            bias=bias,
            stride=stride,
            padding=padding,
            output_padding=output_padding,
            groups=groups,
            dilation=dilation)
    op = _conv2d_gradfix(
        transpose=True,
        weight_shape=weight.shape,
        stride=stride,
        padding=padding,
        output_padding=output_padding,
        groups=groups,
        dilation=dilation)
    return op.apply(input, weight, bias)
def _should_use_custom_op(input):
    """Return True if the custom gradfix op should handle ``input``.

    Requires the module-level ``enabled`` flag, an enabled cuDNN backend,
    and a CUDA tensor.
    """
    assert isinstance(input, torch.Tensor)
    if not enabled or not torch.backends.cudnn.enabled:
        return False
    return input.device.type == 'cuda'
def _to_tuple(x, ndim):
xs = tuple(x) if isinstance(x, (tuple, list)) else (x, ) * ndim
assert len(xs) == ndim
assert all(isinstance(x, int) for x in xs)
return xs
# Cache of generated autograd Function classes, keyed by the full conv
# configuration (transpose, weight shape, stride, padding, ...), so each
# configuration is compiled into a class only once.
_conv2d_gradfix_cache: Dict = dict()
# Sentinel saved via save_for_backward in place of a tensor whose gradient
# is not required.
_null_tensor = torch.empty([0])
def _conv2d_gradfix(
    transpose: bool,
    weight_shape: Tuple[int, ...],
    stride: Union[int, Tuple[int, ...]],
    padding: Union[int, Tuple[int, ...]],
    output_padding: Union[int, Tuple[int, ...]],
    dilation: Union[int, Tuple[int, ...]],
    groups: int,
):
    """Build (and cache) a custom autograd ``Function`` implementing 2D
    convolution / transposed convolution for one fixed configuration.

    The returned class behaves like ``torch.nn.functional.conv2d`` (or
    ``conv_transpose2d`` when ``transpose`` is True) in the forward pass, but
    routes the weight gradient through ``Conv2dGradWeight`` so it can be
    suppressed via the module-level ``weight_gradients_disabled`` flag and so
    second-order gradients stay well defined.

    Args:
        transpose (bool): Whether to implement ``conv_transpose2d`` instead
            of ``conv2d``.
        weight_shape (tuple): Full 4-D shape of the convolution weight.
        stride, padding, output_padding, dilation: Scalar or 2-tuple
            convolution parameters; scalars are broadcast via ``_to_tuple``.
        groups (int): Number of convolution groups (>= 1).

    Returns:
        type: A ``torch.autograd.Function`` subclass; call its
        ``apply(input, weight, bias)``.
    """
    # Parse arguments.
    ndim = 2
    weight_shape = tuple(weight_shape)
    stride = _to_tuple(stride, ndim)
    padding = _to_tuple(padding, ndim)
    output_padding = _to_tuple(output_padding, ndim)
    dilation = _to_tuple(dilation, ndim)
    # Lookup from cache.
    key = (transpose, weight_shape, stride, padding, output_padding, dilation,
           groups)
    if key in _conv2d_gradfix_cache:
        return _conv2d_gradfix_cache[key]
    # Validate arguments.
    assert groups >= 1
    assert len(weight_shape) == ndim + 2
    assert all(stride[i] >= 1 for i in range(ndim))  # type: ignore
    assert all(padding[i] >= 0 for i in range(ndim))  # type: ignore
    assert all(dilation[i] >= 0 for i in range(ndim))  # type: ignore
    if not transpose:
        assert all(output_padding[i] == 0 for i in range(ndim))  # type: ignore
    else:  # transpose
        for i in range(ndim):
            assert 0 <= output_padding[i] < max(  # type: ignore
                stride[i],  # type: ignore
                dilation[i])  # type: ignore
    # Helpers.
    common_kwargs = dict(
        stride=stride, padding=padding, dilation=dilation, groups=groups)

    # Output padding needed so that the gradient convolution (run with the
    # opposite ``transpose`` flag) reproduces exactly ``input_shape``.
    def calc_output_padding(input_shape, output_shape):
        if transpose:
            return [0, 0]
        return [
            input_shape[i + 2] - (output_shape[i + 2] - 1) * stride[i] -
            (1 - 2 * padding[i]) - dilation[i] * (weight_shape[i + 2] - 1)
            for i in range(ndim)
        ]

    # Forward & backward.
    class Conv2d(torch.autograd.Function):

        @staticmethod
        def forward(ctx, input, weight, bias):
            assert weight.shape == weight_shape
            # Only save tensors whose counterpart actually needs a gradient;
            # otherwise save the cheap ``_null_tensor`` sentinel.
            ctx.save_for_backward(
                input if weight.requires_grad else _null_tensor,
                weight if input.requires_grad else _null_tensor,
            )
            ctx.input_shape = input.shape
            # Simple 1x1 convolution => cuBLAS (only on Volta, not on Ampere).
            if weight_shape[2:] == stride == dilation == (
                    1, 1) and padding == (
                        0, 0) and torch.cuda.get_device_capability(
                            input.device) < (8, 0):
                a = weight.reshape(groups, weight_shape[0] // groups,
                                   weight_shape[1])
                b = input.reshape(input.shape[0], groups,
                                  input.shape[1] // groups, -1)
                c = (a.transpose(1, 2) if transpose else a) @ b.permute(
                    1, 2, 0, 3).flatten(2)
                c = c.reshape(-1, input.shape[0],
                              *input.shape[2:]).transpose(0, 1)
                c = c if bias is None else c + bias.unsqueeze(0).unsqueeze(
                    2).unsqueeze(3)
                return c.contiguous(
                    memory_format=(torch.channels_last if input.stride(1) ==
                                   1 else torch.contiguous_format))
            # General case => cuDNN.
            if transpose:
                return torch.nn.functional.conv_transpose2d(
                    input=input,
                    weight=weight,
                    bias=bias,
                    output_padding=output_padding,
                    **common_kwargs)
            return torch.nn.functional.conv2d(
                input=input, weight=weight, bias=bias, **common_kwargs)

        @staticmethod
        def backward(ctx, grad_output):
            input, weight = ctx.saved_tensors
            input_shape = ctx.input_shape
            grad_input = None
            grad_weight = None
            grad_bias = None
            if ctx.needs_input_grad[0]:
                # Input gradient is the opposite-direction convolution with
                # the same weights.
                p = calc_output_padding(
                    input_shape=input_shape, output_shape=grad_output.shape)
                op = _conv2d_gradfix(
                    transpose=(not transpose),
                    weight_shape=weight_shape,
                    output_padding=p,
                    **common_kwargs)
                grad_input = op.apply(grad_output, weight, None)
                assert grad_input.shape == input_shape
            if ctx.needs_input_grad[1] and not weight_gradients_disabled:
                grad_weight = Conv2dGradWeight.apply(grad_output, input)
                assert grad_weight.shape == weight_shape
            if ctx.needs_input_grad[2]:
                grad_bias = grad_output.sum([0, 2, 3])
            return grad_input, grad_weight, grad_bias

    # Gradient with respect to the weights.
    class Conv2dGradWeight(torch.autograd.Function):

        @staticmethod
        def forward(ctx, grad_output, input):
            ctx.save_for_backward(
                grad_output if input.requires_grad else _null_tensor,
                input if grad_output.requires_grad else _null_tensor,
            )
            ctx.grad_output_shape = grad_output.shape
            ctx.input_shape = input.shape
            # Simple 1x1 convolution => cuBLAS (on both Volta and Ampere).
            if weight_shape[2:] == stride == dilation == (
                    1, 1) and padding == (0, 0):
                a = grad_output.reshape(grad_output.shape[0], groups,
                                        grad_output.shape[1] // groups,
                                        -1).permute(1, 2, 0, 3).flatten(2)
                b = input.reshape(input.shape[0], groups,
                                  input.shape[1] // groups,
                                  -1).permute(1, 2, 0, 3).flatten(2)
                c = (b @ a.transpose(1, 2) if transpose else
                     a @ b.transpose(1, 2)).reshape(weight_shape)
                return c.contiguous(
                    memory_format=(torch.channels_last if input.stride(1) ==
                                   1 else torch.contiguous_format))
            # PyTorch consolidated convolution backward API in PR:
            # https://github.com/pytorch/pytorch/commit/3dc3651e0ee3623f669c3a2c096408dbc476d122 # noqa: E501
            # Enhance the code referring to the discussion:
            # https://github.com/pytorch/pytorch/issues/74437
            if digit_version(torch.__version__) >= digit_version('1.11.0'):
                empty_weight = torch.tensor(
                    0.0, dtype=input.dtype,
                    device=input.device).expand(weight_shape)
                output_padding = calc_output_padding(input.shape,
                                                     grad_output.shape)
                return torch.ops.aten.convolution_backward(
                    grad_output,
                    input,
                    empty_weight,
                    None,
                    stride=stride,
                    dilation=dilation,
                    transposed=transpose,
                    padding=padding,
                    groups=groups,
                    output_padding=output_padding,
                    output_mask=[0, 1, 0])[1]
            else:
                # Pre-1.11 fallback: call the backend-specific backward-weight
                # op directly through the JIT operator registry.
                is_rocm_pytorch = False
                try:
                    from torch.utils.cpp_extension import ROCM_HOME
                    is_rocm_pytorch = True if ((torch.version.hip is not None) and
                                               (ROCM_HOME is not None)) else False
                except ImportError:
                    pass
                name=''
                flags=[]
                if is_rocm_pytorch:
                    name = ('aten::miopen_convolution_transpose_backward_weight'
                            if transpose else
                            'aten::miopen_convolution_backward_weight')
                    flags = [
                        torch.backends.cudnn.benchmark,
                        torch.backends.cudnn.deterministic
                    ]
                else:
                    # General case => cuDNN.
                    name = ('aten::cudnn_convolution_transpose_backward_weight'
                            if transpose else
                            'aten::cudnn_convolution_backward_weight')
                    flags = [
                        torch.backends.cudnn.benchmark,
                        torch.backends.cudnn.deterministic,
                        torch.backends.cudnn.allow_tf32
                    ]
                return torch._C._jit_get_operation(name)(weight_shape,
                                                         grad_output, input,
                                                         padding, stride,
                                                         dilation, groups,
                                                         *flags)

        @staticmethod
        def backward(ctx, grad2_grad_weight):
            grad_output, input = ctx.saved_tensors
            grad_output_shape = ctx.grad_output_shape
            input_shape = ctx.input_shape
            grad2_grad_output = None
            grad2_input = None
            if ctx.needs_input_grad[0]:
                grad2_grad_output = Conv2d.apply(input, grad2_grad_weight,
                                                 None)
                assert grad2_grad_output.shape == grad_output_shape
            if ctx.needs_input_grad[1]:
                p = calc_output_padding(
                    input_shape=input_shape, output_shape=grad_output_shape)
                op = _conv2d_gradfix(
                    transpose=(not transpose),
                    weight_shape=weight_shape,
                    output_padding=p,
                    **common_kwargs)
                grad2_input = op.apply(grad_output, grad2_grad_weight, None)
                assert grad2_input.shape == input_shape
            return grad2_grad_output, grad2_input

    _conv2d_gradfix_cache[key] = Conv2d
    return Conv2d
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
import torch
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['convex_iou', 'convex_giou'])
def convex_giou(pointsets: torch.Tensor,
                polygons: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    """Return generalized intersection-over-union (Jaccard index) between
    point sets and polygons.

    Args:
        pointsets (torch.Tensor): It has shape (N, 18),
            indicating (x1, y1, x2, y2, ..., x9, y9) for each row.
        polygons (torch.Tensor): It has shape (N, 8),
            indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: The first element is the gious
        between point sets and polygons with the shape (N,). The second
        element is the gradient of point sets with the shape (N, 18).
    """
    num_pointsets = pointsets.size(0)
    # The extension fills 18 gradient entries plus one giou value per row.
    buffer = pointsets.new_zeros((num_pointsets, 19))
    ext_module.convex_giou(pointsets, polygons, buffer)
    gious = buffer[:, -1]
    points_grad = buffer[:, :-1]
    return gious, points_grad
def convex_iou(pointsets: torch.Tensor,
               polygons: torch.Tensor) -> torch.Tensor:
    """Return intersection-over-union (Jaccard index) between point sets and
    polygons.

    Args:
        pointsets (torch.Tensor): It has shape (N, 18),
            indicating (x1, y1, x2, y2, ..., x9, y9) for each row.
        polygons (torch.Tensor): It has shape (K, 8),
            indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row.

    Returns:
        torch.Tensor: Return the ious between point sets and polygons with
        the shape (N, K).
    """
    num_pointsets = pointsets.size(0)
    num_polygons = polygons.size(0)
    ious = pointsets.new_zeros((num_pointsets, num_polygons))
    ext_module.convex_iou(pointsets, polygons, ious)
    return ious
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
import torch import torch
from torch import Tensor, nn from torch import nn
from mmengine.utils import digit_version from torch.autograd import Function
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', [
'top_pool_forward', 'top_pool_backward', 'bottom_pool_forward',
'bottom_pool_backward', 'left_pool_forward', 'left_pool_backward',
'right_pool_forward', 'right_pool_backward'
])
_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3} _mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3}
def _corner_pool(x: Tensor, dim: int, flip: bool) -> Tensor: class TopPoolFunction(Function):
size = x.size(dim)
output = x.clone()
ind = 1 @staticmethod
while ind < size: def symbolic(g, input):
if flip: output = g.op(
cur_start = 0 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['top']))
cur_len = size - ind return output
next_start = ind
next_len = size - ind @staticmethod
else: def forward(ctx, input):
cur_start = ind output = ext_module.top_pool_forward(input)
cur_len = size - ind ctx.save_for_backward(input)
next_start = 0 return output
next_len = size - ind
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
output = ext_module.top_pool_backward(input, grad_output)
return output
class BottomPoolFunction(Function):
@staticmethod
def symbolic(g, input):
output = g.op(
'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['bottom']))
return output
@staticmethod
def forward(ctx, input):
output = ext_module.bottom_pool_forward(input)
ctx.save_for_backward(input)
return output
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
output = ext_module.bottom_pool_backward(input, grad_output)
return output
# max_temp should be cloned for backward computation
max_temp = output.narrow(dim, cur_start, cur_len).clone()
cur_temp = output.narrow(dim, cur_start, cur_len)
next_temp = output.narrow(dim, next_start, next_len)
cur_temp[...] = torch.where(max_temp > next_temp, max_temp, next_temp) class LeftPoolFunction(Function):
ind = ind << 1 @staticmethod
def symbolic(g, input):
output = g.op(
'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['left']))
return output
return output @staticmethod
def forward(ctx, input):
output = ext_module.left_pool_forward(input)
ctx.save_for_backward(input)
return output
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
output = ext_module.left_pool_backward(input, grad_output)
return output
class RightPoolFunction(Function):
@staticmethod
def symbolic(g, input):
output = g.op(
'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['right']))
return output
@staticmethod
def forward(ctx, input):
output = ext_module.right_pool_forward(input)
ctx.save_for_backward(input)
return output
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
output = ext_module.right_pool_backward(input, grad_output)
return output
class CornerPool(nn.Module): class CornerPool(nn.Module):
...@@ -40,13 +104,11 @@ class CornerPool(nn.Module): ...@@ -40,13 +104,11 @@ class CornerPool(nn.Module):
Corner Pooling is a new type of pooling layer that helps a Corner Pooling is a new type of pooling layer that helps a
convolutional network better localize corners of bounding boxes. convolutional network better localize corners of bounding boxes.
Please refer to `CornerNet: Detecting Objects as Paired Keypoints Please refer to https://arxiv.org/abs/1808.01244 for more details.
<https://arxiv.org/abs/1808.01244>`_ for more details.
Code is modified from https://github.com/princeton-vl/CornerNet-Lite. Code is modified from https://github.com/princeton-vl/CornerNet-Lite.
Args: Args:
mode (str): Pooling orientation for the pooling layer mode(str): Pooling orientation for the pooling layer
- 'bottom': Bottom Pooling - 'bottom': Bottom Pooling
- 'left': Left Pooling - 'left': Left Pooling
...@@ -57,6 +119,13 @@ class CornerPool(nn.Module): ...@@ -57,6 +119,13 @@ class CornerPool(nn.Module):
Feature map after pooling. Feature map after pooling.
""" """
pool_functions = {
'bottom': BottomPoolFunction,
'left': LeftPoolFunction,
'right': RightPoolFunction,
'top': TopPoolFunction,
}
cummax_dim_flip = { cummax_dim_flip = {
'bottom': (2, False), 'bottom': (2, False),
'left': (3, True), 'left': (3, True),
...@@ -64,13 +133,23 @@ class CornerPool(nn.Module): ...@@ -64,13 +133,23 @@ class CornerPool(nn.Module):
'top': (2, True), 'top': (2, True),
} }
def __init__(self, mode: str): def __init__(self, mode):
super().__init__() super(CornerPool, self).__init__()
assert mode in self.cummax_dim_flip assert mode in self.pool_functions
self.mode = mode self.mode = mode
self.corner_pool = self.pool_functions[mode]
def forward(self, x):
if torch.__version__ != 'parrots' and torch.__version__ >= '1.5.0':
if torch.onnx.is_in_onnx_export():
assert torch.__version__ >= '1.7.0', \
'When `cummax` serves as an intermediate component whose '\
'outputs is used as inputs for another modules, it\'s '\
'expected that pytorch version must be >= 1.7.0, '\
'otherwise Error appears like: `RuntimeError: tuple '\
'appears in op that does not forward tuples, unsupported '\
'kind: prim::PythonOp`.'
def forward(self, x: Tensor) -> Tensor:
if torch.__version__ != 'parrots' and digit_version(torch.__version__) >= digit_version('1.5.0'):
dim, flip = self.cummax_dim_flip[self.mode] dim, flip = self.cummax_dim_flip[self.mode]
if flip: if flip:
x = x.flip(dim) x = x.flip(dim)
...@@ -79,5 +158,4 @@ class CornerPool(nn.Module): ...@@ -79,5 +158,4 @@ class CornerPool(nn.Module):
pool_tensor = pool_tensor.flip(dim) pool_tensor = pool_tensor.flip(dim)
return pool_tensor return pool_tensor
else: else:
dim, flip = self.cummax_dim_flip[self.mode] return self.corner_pool.apply(x)
return _corner_pool(x, dim, flip)
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
import torch import torch
from torch import Tensor, nn from torch import Tensor, nn
from torch.autograd import Function from torch.autograd import Function
...@@ -17,14 +15,14 @@ class CorrelationFunction(Function): ...@@ -17,14 +15,14 @@ class CorrelationFunction(Function):
@staticmethod @staticmethod
def forward(ctx, def forward(ctx,
input1: Tensor, input1,
input2: Tensor, input2,
kernel_size: int = 1, kernel_size=1,
max_displacement: int = 1, max_displacement=1,
stride: int = 1, stride=1,
padding: int = 1, padding=1,
dilation: int = 1, dilation=1,
dilation_patch: int = 1) -> Tensor: dilation_patch=1):
ctx.save_for_backward(input1, input2) ctx.save_for_backward(input1, input2)
...@@ -62,9 +60,7 @@ class CorrelationFunction(Function): ...@@ -62,9 +60,7 @@ class CorrelationFunction(Function):
@staticmethod @staticmethod
@once_differentiable @once_differentiable
def backward( def backward(ctx, grad_output):
ctx, grad_output: Tensor
) -> Tuple[Tensor, Tensor, None, None, None, None, None, None]:
input1, input2 = ctx.saved_tensors input1, input2 = ctx.saved_tensors
kH, kW = ctx.kernel_size kH, kW = ctx.kernel_size
......
...@@ -13,150 +13,158 @@ This folder contains all non-python code for MMCV custom ops. Please follow the ...@@ -13,150 +13,158 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
│ ├── pytorch_cpp_helper.hpp │ ├── pytorch_cpp_helper.hpp
│ ├── pytorch_cuda_helper.hpp │ ├── pytorch_cuda_helper.hpp
│ ├── pytorch_device_registry.hpp │ ├── pytorch_device_registry.hpp
│   ├── cuda │   └── cuda
│   │ ├── common_cuda_helper.hpp │   ├── common_cuda_helper.hpp
│   │ ├── parrots_cudawarpfunction.cuh │   ├── parrots_cudawarpfunction.cuh
│   │ ├── ... │   ├── ...
│   │ └── ops_cuda_kernel.cuh │   └── ops_cuda_kernel.cuh
|   ├── mps ├── onnxruntime
│   │ ├── MPSLibrary.h │   ├── onnxruntime_register.h
│   │ ├── ... │   ├── onnxruntime_session_options_config_keys.h
│   │ └── MPSUtils.h │   ├── ort_mmcv_utils.h
|   ├── mlu │   ├── ...
│   │ └── ... │   ├── onnx_ops.h
|   └── utils │   └── cpu
│   │ └── ... │ ├── onnxruntime_register.cpp
│      ├── ...
│      └── onnx_ops_impl.cpp
├── parrots ├── parrots
│   ├── ... │   ├── ...
│   ├── ops.cpp │   ├── ops.cpp
│   ├── ops_parrots.cpp │   ├── ops_parrots.cpp
│   └── ops_pytorch.h │   └── ops_pytorch.h
└── pytorch ├── pytorch
    ├── info.cpp │   ├── info.cpp
    ├── pybind.cpp │   ├── pybind.cpp
    ├── ... │   ├── ...
    ├── ops.cpp │   ├── ops.cpp
    ├── cuda │   ├── cuda
    │   ├── ... │   │   ├── ...
    │   └── ops_cuda.cu │   │   └── ops_cuda.cu
    ├── cpu │   └── cpu
    │   ├── ... │      ├── ...
    │   └── ops.cpp │      └── ops.cpp
    ├── mps └── tensorrt
    │   ├── ... ├── trt_cuda_helper.cuh
    |   └── op_mps.mm ├── trt_plugin_helper.hpp
    └── mlu ├── trt_plugin.hpp
       ├── ... ├── trt_serialize.hpp
       └── op_mlu.cpp ├── ...
├── trt_ops.hpp
└── plugins
   ├── trt_cuda_helper.cu
   ├── trt_plugin.cpp
   ├── ...
   ├── trt_ops.cpp
   └── trt_ops_kernel.cu
``` ```
## Components ## Components
- `common`: This directory contains all tools and shared codes. - `common`: This directory contains all tools and shared codes.
- `cuda`: The cuda kernels which can be shared by all backends. **HIP** kernel is also here since they have similar syntax. - `cuda`: The cuda kernels which can be shared by all backends. **HIP** kernel is also here since they have similar syntax.
- `mps`: The tools used to support MPS ops. **NOTE** that MPS support is **experimental**. - `onnxruntime`: **ONNX Runtime** support for custom ops.
- `mlu`: The MLU kernels used to support [Cambricon](https://www.cambricon.com/) device. - `cpu`: CPU implementation of supported ops.
- `utils`: The kernels and utils of spconv.
- `parrots`: **Parrots** is a deep learning frame for model training and inference. Parrots custom ops are placed in this directory. - `parrots`: **Parrots** is a deep learning frame for model training and inference. Parrots custom ops are placed in this directory.
- `pytorch`: **PyTorch** custom ops are supported by binding C++ to Python with **pybind11**. The ops implementation and binding codes are placed in this directory. - `pytorch`: **PyTorch** custom ops are supported by binding C++ to Python with **pybind11**. The ops implementation and binding codes are placed in this directory.
- `cuda`: This directory contains cuda kernel launchers, which feed memory pointers of tensor to the cuda kernel in `common/cuda`. The launchers provide c++ interface of cuda implementation of corresponding custom ops. - `cuda`: This directory contains cuda kernel launchers, which feed memory pointers of tensor to the cuda kernel in `common/cuda`. The launchers provide c++ interface of cuda implementation of corresponding custom ops.
- `cpu`: This directory contain cpu implementations of corresponding custom ops. - `cpu`: This directory contain cpu implementations of corresponding custom ops.
- `mlu`: This directory contain launchers of each MLU kernels. - `tensorrt`: **TensorRT** support for custom ops.
- `mps`: MPS ops implementation and launchers. - `plugins`: This directory contains the implementation of the supported custom ops. Some ops might also use shared cuda kernel in `common/cuda`.
## How to add new PyTorch ops? ## How to add new PyTorch ops?
1. (Optional) Add shared kernel in `common` to support special hardware platform. 1. (Optional) Add shared kernel in `common` to support special hardware platform.
```c++ ```c++
// src/common/cuda/new_ops_cuda_kernel.cuh // src/common/cuda/new_ops_cuda_kernel.cuh
template <typename T> template <typename T>
__global__ void new_ops_forward_cuda_kernel(const T* input, T* output, ...) { __global__ void new_ops_forward_cuda_kernel(const T* input, T* output, ...) {
// forward here // forward here
} }
``` ```
Add cuda kernel launcher in `pytorch/cuda`. Add cuda kernel launcher in `pytorch/cuda`.
```c++ ```c++
// src/pytorch/cuda // src/pytorch/cuda
#include <new_ops_cuda_kernel.cuh> #include <new_ops_cuda_kernel.cuh>
void NewOpsForwardCUDAKernelLauncher(Tensor input, Tensor output, ...){ void NewOpsForwardCUDAKernelLauncher(Tensor input, Tensor output, ...){
// initialize // initialize
at::cuda::CUDAGuard device_guard(input.device()); at::cuda::CUDAGuard device_guard(input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
... ...
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "new_ops_forward_cuda_kernel", ([&] { input.scalar_type(), "new_ops_forward_cuda_kernel", ([&] {
new_ops_forward_cuda_kernel<scalar_t> new_ops_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
input.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),...); input.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),...);
})); }));
AT_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
``` ```
2. Register implementation for different devices. 2. Register implementation for different devices.
```c++ ```c++
// src/pytorch/cuda/cudabind.cpp // src/pytorch/cuda/cudabind.cpp
... ...
Tensor new_ops_forward_cuda(Tensor input, Tensor output, ...){ Tensor new_ops_forward_cuda(Tensor input, Tensor output, ...){
// implement cuda forward here // implement cuda forward here
// use `NewOpsForwardCUDAKernelLauncher` here // use `NewOpsForwardCUDAKernelLauncher` here
} }
// declare interface here. // declare interface here.
Tensor new_ops_forward_impl(Tensor input, Tensor output, ...); Tensor new_ops_forward_impl(Tensor input, Tensor output, ...);
// register the implementation for given device (CUDA here). // register the implementation for given device (CUDA here).
REGISTER_DEVICE_IMPL(new_ops_forward_impl, CUDA, new_ops_forward_cuda); REGISTER_DEVICE_IMPL(new_ops_forward_impl, CUDA, new_ops_forward_cuda);
``` ```
3. Add ops implementation in `pytorch` directory. Select different implementations according to device type. 3. Add ops implementation in `pytorch` directory. Select different implementations according to device type.
```c++ ```c++
// src/pytorch/new_ops.cpp // src/pytorch/new_ops.cpp
Tensor new_ops_forward_impl(Tensor input, Tensor output, ...){ Tensor new_ops_forward_impl(Tensor input, Tensor output, ...){
// dispatch the implementation according to the device type of input. // dispatch the implementation according to the device type of input.
DISPATCH_DEVICE_IMPL(new_ops_forward_impl, input, output, ...); DISPATCH_DEVICE_IMPL(new_ops_forward_impl, input, output, ...);
} }
... ...
Tensor new_ops_forward(Tensor input, Tensor output, ...){ Tensor new_ops_forward(Tensor input, Tensor output, ...){
return new_ops_forward_impl(input, output, ...); return new_ops_forward_impl(input, output, ...);
} }
``` ```
4. Binding the implementation in `pytorch/pybind.cpp` 4. Binding the implementation in `pytorch/pybind.cpp`
```c++ ```c++
// src/pytorch/pybind.cpp // src/pytorch/pybind.cpp
... ...
Tensor new_ops_forward(Tensor input, Tensor output, ...); Tensor new_ops_forward(Tensor input, Tensor output, ...);
... ...
// bind with pybind11 // bind with pybind11
m.def("new_ops_forward", &new_ops_forward, "new_ops_forward", m.def("new_ops_forward", &new_ops_forward, "new_ops_forward",
py::arg("input"), py::arg("output"), ...); py::arg("input"), py::arg("output"), ...);
... ...
``` ```
5. Build MMCV again. Enjoy new ops in python 5. Build MMCV again. Enjoy new ops in python
```python ```python
from ..utils import ext_loader from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['new_ops_forward']) ext_module = ext_loader.load_ext('_ext', ['new_ops_forward'])
... ...
ext_module.new_ops_forward(input, output, ...) ext_module.new_ops_forward(input, output, ...)
``` ```
...@@ -220,10 +220,6 @@ HOST_DEVICE_INLINE int convex_hull_graham(const Point<T> (&p)[24], ...@@ -220,10 +220,6 @@ HOST_DEVICE_INLINE int convex_hull_graham(const Point<T> (&p)[24],
return temp > 0; return temp > 0;
} }
}); });
// compute distance to origin after sort, since the points are now different.
for (int i = 0; i < num_in; i++) {
dist[i] = dot_2d<T>(q[i], q[i]);
}
#endif #endif
// Step 4: // Step 4:
...@@ -270,17 +266,6 @@ HOST_DEVICE_INLINE int convex_hull_graham(const Point<T> (&p)[24], ...@@ -270,17 +266,6 @@ HOST_DEVICE_INLINE int convex_hull_graham(const Point<T> (&p)[24],
return m; return m;
} }
template <typename T>
HOST_DEVICE_INLINE T quadri_box_area(const Point<T> (&q)[4]) {
T area = 0;
#pragma unroll
for (int i = 1; i < 3; i++) {
area += fabs(cross_2d<T>(q[i] - q[0], q[i + 1] - q[0]));
}
return area / 2.0;
}
template <typename T> template <typename T>
HOST_DEVICE_INLINE T polygon_area(const Point<T> (&q)[24], const int& m) { HOST_DEVICE_INLINE T polygon_area(const Point<T> (&q)[24], const int& m) {
if (m <= 2) { if (m <= 2) {
...@@ -319,25 +304,6 @@ HOST_DEVICE_INLINE T rotated_boxes_intersection(const RotatedBox<T>& box1, ...@@ -319,25 +304,6 @@ HOST_DEVICE_INLINE T rotated_boxes_intersection(const RotatedBox<T>& box1,
return polygon_area<T>(orderedPts, num_convex); return polygon_area<T>(orderedPts, num_convex);
} }
template <typename T>
HOST_DEVICE_INLINE T quadri_boxes_intersection(const Point<T> (&pts1)[4],
const Point<T> (&pts2)[4]) {
// There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
// from rotated_rect_intersection_pts
Point<T> intersectPts[24], orderedPts[24];
int num = get_intersection_points<T>(pts1, pts2, intersectPts);
if (num <= 2) {
return 0.0;
}
// Convex Hull to order the intersection points in clockwise order and find
// the contour area.
int num_convex = convex_hull_graham<T>(intersectPts, num, orderedPts, true);
return polygon_area<T>(orderedPts, num_convex);
}
} // namespace } // namespace
template <typename T> template <typename T>
...@@ -375,52 +341,3 @@ HOST_DEVICE_INLINE T single_box_iou_rotated(T const* const box1_raw, ...@@ -375,52 +341,3 @@ HOST_DEVICE_INLINE T single_box_iou_rotated(T const* const box1_raw,
const T iou = intersection / baseS; const T iou = intersection / baseS;
return iou; return iou;
} }
template <typename T>
HOST_DEVICE_INLINE T single_box_iou_quadri(T const* const pts1_raw,
T const* const pts2_raw,
const int mode_flag) {
// shift center to the middle point to achieve higher precision in result
Point<T> pts1[4], pts2[4];
auto center_shift_x =
(pts1_raw[0] + pts2_raw[0] + pts1_raw[2] + pts2_raw[2] + pts1_raw[4] +
pts2_raw[4] + pts1_raw[6] + pts2_raw[6]) /
8.0;
auto center_shift_y =
(pts1_raw[1] + pts2_raw[1] + pts1_raw[3] + pts2_raw[3] + pts1_raw[5] +
pts2_raw[5] + pts1_raw[7] + pts2_raw[7]) /
8.0;
pts1[0].x = pts1_raw[0] - center_shift_x;
pts1[0].y = pts1_raw[1] - center_shift_y;
pts1[1].x = pts1_raw[2] - center_shift_x;
pts1[1].y = pts1_raw[3] - center_shift_y;
pts1[2].x = pts1_raw[4] - center_shift_x;
pts1[2].y = pts1_raw[5] - center_shift_y;
pts1[3].x = pts1_raw[6] - center_shift_x;
pts1[3].y = pts1_raw[7] - center_shift_y;
pts2[0].x = pts2_raw[0] - center_shift_x;
pts2[0].y = pts2_raw[1] - center_shift_y;
pts2[1].x = pts2_raw[2] - center_shift_x;
pts2[1].y = pts2_raw[3] - center_shift_y;
pts2[2].x = pts2_raw[4] - center_shift_x;
pts2[2].y = pts2_raw[5] - center_shift_y;
pts2[3].x = pts2_raw[6] - center_shift_x;
pts2[3].y = pts2_raw[7] - center_shift_y;
const T area1 = quadri_box_area<T>(pts1);
const T area2 = quadri_box_area<T>(pts2);
if (area1 < 1e-14 || area2 < 1e-14) {
return 0.f;
}
const T intersection = quadri_boxes_intersection<T>(pts1, pts2);
T baseS = 1.0;
if (mode_flag == 0) {
baseS = (area1 + area2 - intersection);
} else if (mode_flag == 1) {
baseS = area1;
}
const T iou = intersection / baseS;
return iou;
}
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cuda/ActiveRotatingFilter_cuda.cu
#ifndef ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH
#define ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
template <typename scalar_t>
// Forward kernel: copies each filter weight into every rotated copy of the
// filter bank, at destinations given by the (1-based) index map
// `indices_data`. One thread handles one source weight element.
__global__ void active_rotated_filter_forward_cuda_kernel(
    const int nthreads, const scalar_t* weight_data, const int* indices_data,
    const int num_input_planes, const int num_output_planes,
    const int num_orientations, const int num_rotations, const int nEntry,
    scalar_t* output_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Decompose the flat index: l = offset inside one filter entry,
    // j = input plane, i = remaining outer index.
    int l = index % nEntry;
    int j = (index / nEntry) % num_input_planes;
    int i = index / nEntry / num_input_planes;
    int k;
    scalar_t val = *(weight_data + index);
    for (k = 0; k < num_rotations; k++) {
      // indices_data holds 1-based destination offsets; convert to 0-based.
      int idx = (int)(*(indices_data + l * num_rotations + k)) - 1;
      // Destination layout: [i][rotation k][input plane j][entry idx].
      scalar_t* target = output_data +
                         i * (num_rotations * num_input_planes * nEntry) +
                         k * (num_input_planes * nEntry) + j * (nEntry) + idx;
      *target = val;
    }
  }
}
template <typename scalar_t>
// Backward kernel: accumulates, for each source weight element, the
// gradients of all its rotated copies (mirror of the forward scatter).
// Each thread owns one weight element, so no atomics are needed.
__global__ void active_rotated_filter_backward_cuda_kernel(
    const int nthreads, const scalar_t* gradWeight_data,
    const int* indices_data, const int num_input_planes,
    const int num_output_planes, const int num_orientations,
    const int num_rotations, const int nEntry, scalar_t* weight_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Same index decomposition as the forward kernel.
    int l = index % nEntry;
    int j = (index / nEntry) % num_input_planes;
    int i = index / nEntry / num_input_planes;
    int k;
    scalar_t* val = weight_data + index;
    *val = 0;
    scalar_t tmp = 0;
    for (k = 0; k < num_rotations; k++) {
      // Read back from the same (1-based) destination the forward wrote to.
      int idx = (int)(*(indices_data + l * num_rotations + k)) - 1;
      scalar_t target =
          *(gradWeight_data + i * (num_rotations * num_input_planes * nEntry) +
            k * (num_input_planes * nEntry) + j * (nEntry) + idx);
      tmp = tmp + target;
    }
    *val = tmp;
  }
}
#endif // ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH
...@@ -22,34 +22,34 @@ __global__ void assign_score_withk_forward_cuda_kernel( ...@@ -22,34 +22,34 @@ __global__ void assign_score_withk_forward_cuda_kernel(
const int O, const int aggregate, const T* points, const T* centers, const int O, const int aggregate, const T* points, const T* centers,
const T* scores, const int64_t* knn_idx, T* output) { const T* scores, const int64_t* knn_idx, T* output) {
// ----- parallel loop for B, N1, K and O --------- // ----- parallel loop for B, N1, K and O ---------
CUDA_1D_KERNEL_LOOP(i, B * O * N1 * K) { long i = blockIdx.x * blockDim.x + threadIdx.x;
// ------- loop for M ---------- if (i >= B * N1 * K * O) return;
const int b = (int)(i / (O * N1 * K)); // ------- loop for M ----------
const int o = (int)(i % (O * N1 * K) / (N1 * K)); const int b = (int)(i / (O * N1 * K));
const int n = (int)(i % (N1 * K) / K); const int o = (int)(i % (O * N1 * K) / (N1 * K));
const int k = (int)(i % K); const int n = (int)(i % (N1 * K) / K);
const int cn = (int)knn_idx[b * K * N1 + n * K + const int k = (int)(i % K);
0]; // The first neighbor is the center point const int cn = (int)knn_idx[b * K * N1 + n * K +
const int kn = (int)knn_idx[b * K * N1 + n * K + k]; 0]; // The first neighbor is the center point
if (kn >= N0 || const int kn = (int)knn_idx[b * K * N1 + n * K + k];
kn < 0) { // if index overflows, it is out of the neighborhood range if (kn >= N0 ||
return; kn < 0) { // if index overflows, it is out of the neighborhood range
} return;
assert(b < B); }
assert(kn < N0); assert(b < B);
assert(cn < N0); assert(kn < N0);
assert(o < O); assert(cn < N0);
assert(n < N1); assert(o < O);
const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k; assert(n < N1);
T val = output[out_idx]; const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k;
for (int m = 0; m < M; m++) { T val = output[out_idx];
val += points[b * N0 * M * O + kn * M * O + m * O + o] * for (int m = 0; m < M; m++) {
scores[b * N1 * K * M + n * K * M + k * M + m] - val += points[b * N0 * M * O + kn * M * O + m * O + o] *
centers[b * N0 * M * O + cn * M * O + m * O + o] * scores[b * N1 * K * M + n * K * M + k * M + m] -
scores[b * N1 * K * M + n * K * M + k * M + m]; centers[b * N0 * M * O + cn * M * O + m * O + o] *
} scores[b * N1 * K * M + n * K * M + k * M + m];
output[out_idx] = val;
} }
output[out_idx] = val;
} }
template <typename T> template <typename T>
...@@ -58,27 +58,27 @@ __global__ void assign_score_withk_points_backward_cuda_kernel( ...@@ -58,27 +58,27 @@ __global__ void assign_score_withk_points_backward_cuda_kernel(
const int O, const int aggregate, const T* grad_out, const T* scores, const int O, const int aggregate, const T* grad_out, const T* scores,
const int64_t* knn_idx, T* grad_points, T* grad_centers) { const int64_t* knn_idx, T* grad_points, T* grad_centers) {
// ----- parallel loop for B, M, O --------- // ----- parallel loop for B, M, O ---------
CUDA_1D_KERNEL_LOOP(i, B * M * O) { long i = blockIdx.x * blockDim.x + threadIdx.x;
int b = (int)(i / (M * O)); if (i >= B * M * O) return;
int m = (int)(i % (M * O) / O); int b = (int)(i / (M * O));
int o = (int)(i % O); int m = (int)(i % (M * O) / O);
int o = (int)(i % O);
// ----- loop for N,K --------- // ----- loop for N,K ---------
for (int n = 0; n < N; n++) { for (int n = 0; n < N; n++) {
for (int k = 0; k < K; k++) { for (int k = 0; k < K; k++) {
int kn = knn_idx[b * N * K + n * K + k]; int kn = knn_idx[b * N * K + n * K + k];
int cn = knn_idx[b * N * K + n * K + 0]; int cn = knn_idx[b * N * K + n * K + 0];
if (kn >= N0 || kn < 0) { // if index overflows, it is out of the if (kn >= N0 ||
// neighborhood range kn < 0) { // if index overflows, it is out of the neighborhood range
continue; continue;
}
atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o,
scores[b * N * K * M + n * K * M + k * M + m] *
grad_out[b * O * N * K + o * N * K + n * K + k]);
atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o,
-scores[b * N * K * M + n * K * M + k * M + m] *
grad_out[b * O * N * K + o * N * K + n * K + k]);
} }
atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o,
scores[b * N * K * M + n * K * M + k * M + m] *
grad_out[b * O * N * K + o * N * K + n * K + k]);
atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o,
-scores[b * N * K * M + n * K * M + k * M + m] *
grad_out[b * O * N * K + o * N * K + n * K + k]);
} }
} }
} }
...@@ -89,28 +89,28 @@ __global__ void assign_score_withk_scores_backward_cuda_kernel( ...@@ -89,28 +89,28 @@ __global__ void assign_score_withk_scores_backward_cuda_kernel(
const int O, const int aggregate, const T* grad_out, const T* points, const int O, const int aggregate, const T* grad_out, const T* points,
const T* centers, const int64_t* knn_idx, T* grad_scores) { const T* centers, const int64_t* knn_idx, T* grad_scores) {
// ----- parallel loop for B, N, K, M --------- // ----- parallel loop for B, N, K, M ---------
CUDA_1D_KERNEL_LOOP(i, B * N * K * M) { long i = blockIdx.x * blockDim.x + threadIdx.x;
const int b = (int)(i / (N * M * K)); if (i >= B * N * K * M) return;
const int n = (int)(i % (N * M * K) / M / K); const int b = (int)(i / (N * M * K));
const int k = (int)(i % (M * K) / M); const int n = (int)(i % (N * M * K) / M / K);
const int m = (int)(i % M); const int k = (int)(i % (M * K) / M);
const int cn = knn_idx[b * N * K + n * K + 0]; const int m = (int)(i % M);
const int kn = knn_idx[b * N * K + n * K + k]; const int cn = knn_idx[b * N * K + n * K + 0];
if (kn >= N0 || const int kn = knn_idx[b * N * K + n * K + k];
kn < 0) { // if index overflows, it is out of the neighborhood range if (kn >= N0 ||
return; kn < 0) { // if index overflows, it is out of the neighborhood range
} return;
}
// -------------- loop for O ------------------------ // -------------- loop for O ------------------------
const int out_idx = b * N * K * M + n * K * M + k * M + m; const int out_idx = b * N * K * M + n * K * M + k * M + m;
T val = grad_scores[out_idx]; T val = grad_scores[out_idx];
for (int o = 0; o < O; o++) { for (int o = 0; o < O; o++) {
val += (points[b * N0 * M * O + kn * M * O + m * O + o] - val += (points[b * N0 * M * O + kn * M * O + m * O + o] -
centers[b * N0 * M * O + cn * M * O + m * O + o]) * centers[b * N0 * M * O + cn * M * O + m * O + o]) *
grad_out[b * O * N * K + o * N * K + n * K + k]; grad_out[b * O * N * K + o * N * K + n * K + k];
}
grad_scores[out_idx] = val;
} }
grad_scores[out_idx] = val;
} }
#endif // ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH #endif // ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH
...@@ -21,36 +21,35 @@ __global__ void ball_query_forward_cuda_kernel(int b, int n, int m, ...@@ -21,36 +21,35 @@ __global__ void ball_query_forward_cuda_kernel(int b, int n, int m,
// output: // output:
// idx: (B, M, nsample) // idx: (B, M, nsample)
int bs_idx = blockIdx.y; int bs_idx = blockIdx.y;
CUDA_1D_KERNEL_LOOP(pt_idx, m) { int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= b) return; if (bs_idx >= b || pt_idx >= m) return;
new_xyz += bs_idx * m * 3 + pt_idx * 3; new_xyz += bs_idx * m * 3 + pt_idx * 3;
xyz += bs_idx * n * 3; xyz += bs_idx * n * 3;
idx += bs_idx * m * nsample + pt_idx * nsample; idx += bs_idx * m * nsample + pt_idx * nsample;
float max_radius2 = max_radius * max_radius; float max_radius2 = max_radius * max_radius;
float min_radius2 = min_radius * min_radius; float min_radius2 = min_radius * min_radius;
T new_x = new_xyz[0]; T new_x = new_xyz[0];
T new_y = new_xyz[1]; T new_y = new_xyz[1];
T new_z = new_xyz[2]; T new_z = new_xyz[2];
int cnt = 0; int cnt = 0;
for (int k = 0; k < n; ++k) { for (int k = 0; k < n; ++k) {
T x = xyz[k * 3 + 0]; T x = xyz[k * 3 + 0];
T y = xyz[k * 3 + 1]; T y = xyz[k * 3 + 1];
T z = xyz[k * 3 + 2]; T z = xyz[k * 3 + 2];
T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
(new_z - z) * (new_z - z); (new_z - z) * (new_z - z);
if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) { if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) {
if (cnt == 0) { if (cnt == 0) {
for (int l = 0; l < nsample; ++l) { for (int l = 0; l < nsample; ++l) {
idx[l] = k; idx[l] = k;
}
} }
idx[cnt] = k;
++cnt;
if (cnt >= nsample) break;
} }
idx[cnt] = k;
++cnt;
if (cnt >= nsample) break;
} }
} }
} }
......
...@@ -8,27 +8,6 @@ ...@@ -8,27 +8,6 @@
#include "pytorch_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
#endif #endif
// Generic box loader: reads the four corner coordinates (x1, y1, x2, y2) of
// one box starting at bbox[base] into the caller-provided references.
template <typename T>
__device__ __forceinline__ void load_bbox(const T* bbox, const int base, T& x1,
                                          T& y1, T& x2, T& y2) {
  const T* box = bbox + base;
  x1 = box[0];
  y1 = box[1];
  x2 = box[2];
  y2 = box[3];
}
// float specialization: fetches all four coordinates of one box with a single
// vectorized float4 load instead of four scalar loads.
template <>
__device__ __forceinline__ void load_bbox<float>(const float* bbox,
                                                 const int base, float& x1,
                                                 float& y1, float& x2,
                                                 float& y2) {
  // NOTE(review): the reinterpret_cast assumes bbox + base is 16-byte
  // aligned -- holds when base is a multiple of 4 and the underlying buffer
  // is 16-byte aligned (typical for CUDA device allocations); confirm at
  // call sites.
  const float4 bbox_offset = reinterpret_cast<const float4*>(bbox + base)[0];
  x1 = bbox_offset.x;
  y1 = bbox_offset.y;
  x2 = bbox_offset.z;
  y2 = bbox_offset.w;
}
template <typename T> template <typename T>
__global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2, __global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2,
T* ious, const int num_bbox1, T* ious, const int num_bbox1,
...@@ -37,111 +16,69 @@ __global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2, ...@@ -37,111 +16,69 @@ __global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2,
const int offset) { const int offset) {
if (aligned) { if (aligned) {
CUDA_1D_KERNEL_LOOP(index, num_bbox1) { CUDA_1D_KERNEL_LOOP(index, num_bbox1) {
const int b1 = index; int b1 = index;
const int b2 = index; int b2 = index;
const int base1 = b1 << 2; // b1 * 4 int base1 = b1 * 4;
T b1_x1, b1_y1, b1_x2, b1_y2; T b1_x1 = bbox1[base1];
load_bbox<T>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2); T b1_y1 = bbox1[base1 + 1];
const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); T b1_x2 = bbox1[base1 + 2];
T b1_y2 = bbox1[base1 + 3];
const int base2 = b2 << 2; // b2 * 4 T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
T b2_x1, b2_y1, b2_x2, b2_y2;
load_bbox<T>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2); int base2 = b2 * 4;
const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset); T b2_x1 = bbox2[base2];
T b2_y1 = bbox2[base2 + 1];
const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2); T b2_x2 = bbox2[base2 + 2];
const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); T b2_y2 = bbox2[base2 + 3];
const T width = fmaxf(right - left + offset, 0.f); T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
const T height = fmaxf(bottom - top + offset, 0.f);
const T interS = width * height; T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
const T baseS = T width = fmaxf(right - left + offset, 0.f);
fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset)); T height = fmaxf(bottom - top + offset, 0.f);
T interS = width * height;
T baseS = 1.0;
if (mode == 0) {
baseS = fmaxf(b1_area + b2_area - interS, T(offset));
} else if (mode == 1) {
baseS = fmaxf(b1_area, T(offset));
}
ious[index] = interS / baseS; ious[index] = interS / baseS;
} }
} else { } else {
CUDA_1D_KERNEL_LOOP(index, num_bbox1 * num_bbox2) { CUDA_1D_KERNEL_LOOP(index, num_bbox1 * num_bbox2) {
const int b1 = index / num_bbox2; int b1 = index / num_bbox2;
const int b2 = index % num_bbox2; int b2 = index % num_bbox2;
const int base1 = b1 << 2; // b1 * 4 int base1 = b1 * 4;
T b1_x1, b1_y1, b1_x2, b1_y2; T b1_x1 = bbox1[base1];
load_bbox<T>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2); T b1_y1 = bbox1[base1 + 1];
const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); T b1_x2 = bbox1[base1 + 2];
T b1_y2 = bbox1[base1 + 3];
const int base2 = b2 << 2; // b2 * 4 T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
T b2_x1, b2_y1, b2_x2, b2_y2;
load_bbox<T>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2); int base2 = b2 * 4;
const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset); T b2_x1 = bbox2[base2];
T b2_y1 = bbox2[base2 + 1];
const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2); T b2_x2 = bbox2[base2 + 2];
const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); T b2_y2 = bbox2[base2 + 3];
const T width = fmaxf(right - left + offset, 0.f); T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
const T height = fmaxf(bottom - top + offset, 0.f);
const T interS = width * height; T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
const T baseS = T width = fmaxf(right - left + offset, 0.f);
fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset)); T height = fmaxf(bottom - top + offset, 0.f);
T interS = width * height;
T baseS = 1.0;
if (mode == 0) {
baseS = fmaxf(b1_area + b2_area - interS, T(offset));
} else if (mode == 1) {
baseS = fmaxf(b1_area, T(offset));
}
ious[index] = interS / baseS; ious[index] = interS / baseS;
} }
} }
} }
#if __CUDA_ARCH__ >= 530
// fp16 box area: (x2 - x1 + offset) * (y2 - y1 + offset), entirely in __half
// intrinsics. `offset` is the pixel-inclusive correction already converted to
// __half (presumably 0 or 1 -- confirm at call sites).
__device__ __forceinline__ __half __half_area(const __half x1, const __half y1,
                                              const __half x2, const __half y2,
                                              const __half offset) {
  const __half half_w = __hadd(__hsub(x2, x1), offset);
  const __half half_h = __hadd(__hsub(y2, y1), offset);
  return __hmul(half_w, half_h);
}
// fp16 max via the __hge intrinsic: returns a when a >= b, else b.
__device__ __forceinline__ __half __half_max(const __half a, const __half b) {
  return __hge(a, b) ? a : b;
}
// fp16 min via the __hle intrinsic: returns a when a <= b, else b.
__device__ __forceinline__ __half __half_min(const __half a, const __half b) {
  return __hle(a, b) ? a : b;
}
// fp16 won't provide much increase when aligned==true. It is useful when
// aligned==false, which would give you ~40% bonus.
// Half-precision bbox overlap: writes IoU (mode == 0) or intersection over
// bbox1's area (otherwise) into `ious`. aligned == true pairs bbox1[i] with
// bbox2[i] (num_bbox1 outputs); otherwise the full num_bbox1 x num_bbox2
// grid is computed, one output element per loop iteration.
__device__ void bbox_overlaps_cuda_kernel_half(
    const __half* bbox1, const __half* bbox2, __half* ious, const int num_bbox1,
    const int num_bbox2, const int mode, const bool aligned, const int offset) {
  const int num_output = aligned ? num_bbox1 : num_bbox1 * num_bbox2;
  // Convert the integer pixel-offset once; reused for widths and clamping.
  const __half h_offset = __int2half_rn(offset);
  CUDA_1D_KERNEL_LOOP(index, num_output) {
    const int b1 = aligned ? index : index / num_bbox2;
    const int b2 = aligned ? index : index % num_bbox2;
    const int base1 = b1 << 2;  // b1 * 4: four coords per box
    __half b1_x1, b1_y1, b1_x2, b1_y2;
    load_bbox<__half>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);
    const __half b1_area = __half_area(b1_x1, b1_y1, b1_x2, b1_y2, h_offset);
    const int base2 = b2 << 2;  // b2 * 4
    __half b2_x1, b2_y1, b2_x2, b2_y2;
    load_bbox<__half>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);
    const __half b2_area = __half_area(b2_x1, b2_y1, b2_x2, b2_y2, h_offset);
    // Intersection rectangle; width/height clamp to 0 when boxes are disjoint.
    const __half left = __half_max(b1_x1, b2_x1),
                 right = __half_min(b1_x2, b2_x2);
    const __half top = __half_max(b1_y1, b2_y1),
                 bottom = __half_min(b1_y2, b2_y2);
    const __half width =
        __half_max(__hadd(__hsub(right, left), h_offset), __float2half(0.f));
    const __half height =
        __half_max(__hadd(__hsub(bottom, top), h_offset), __float2half(0.f));
    const __half interS = __hmul(width, height);
    // mode == 0: union area; otherwise bbox1's area. Clamped below by
    // h_offset to avoid a zero denominator.
    const __half baseS = __half_max(
        mode == 0 ? __hsub(__hadd(b1_area, b2_area), interS) : b1_area,
        h_offset);
    ious[index] = __hdiv(interS, baseS);
  }
}
#endif // __CUDA_ARCH__ >= 530
#endif // BBOX_OVERLAPS_CUDA_KERNEL_CUH #endif // BBOX_OVERLAPS_CUDA_KERNEL_CUH
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/aim-uofa/AdelaiDet/blob/master/adet/layers/csrc/BezierAlign/BezierAlign_cuda.cu
#ifndef BEZIER_ALIGN_CUDA_KERNEL_CUH
#define BEZIER_ALIGN_CUDA_KERNEL_CUH
#include <float.h>
#ifdef MMCV_WITH_TRT
#include "common_cuda_helper.hpp"
#else // MMCV_WITH_TRT
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else // MMCV_USE_PARROTS
#include "pytorch_cuda_helper.hpp"
#endif // MMCV_USE_PARROTS
#endif // MMCV_WITH_TRT
// Evaluate a cubic Bezier curve at parameter u in [0, 1] given its four
// control points p0..p3 (Bernstein basis). The complement (1 - u) is hoisted
// into a named temporary; the arithmetic is otherwise identical.
template <typename T>
__device__ T bezier_curve(const T p0, const T p1, const T p2, const T p3,
                          const T u) {
  const T w = 1. - u;
  return w * w * w * p0 + 3. * u * w * w * p1 + 3. * u * u * w * p2 +
         u * u * u * p3;
}
// Forward pass of BezierAlign. For each pooled output element (n, c, ph, pw)
// the thread maps the bin onto the region bounded by two cubic Bezier curves
// (read from the RoI's 8 control points), then average-pools bilinearly
// sampled values from the input feature map.
//
// bottom_data: input feature map, laid out (batch, channels, height, width)
// bottom_rois: one row of 17 values per RoI: batch index + 8 (x, y) control
//              points (see the index mapping below)
// top_data:    pooled output, one value per `index`
template <typename T>
__global__ void bezier_align_forward_cuda_kernel(
    const int nthreads,
    const T *bottom_data,  // inputs
    const T *bottom_rois,  // bottom rois contains the bezier curve
    T *top_data,           // outputs
    const int pooled_height, const int pooled_width, const T spatial_scale,
    const int sampling_ratio, bool aligned, const int channels,
    const int height, const int width) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;
    // beziers have size Nx(1+8*2) = Nx17
    const T *offset_bottom_rois = bottom_rois + n * 17;
    int roi_batch_ind = offset_bottom_rois[0];
    // Do not use rounding; this implementation detail is critical
    T offset = aligned ? (T)0.5 : (T)0.0;
    // TODO: avoid this by using parallel annotation, for good
    // Control points 9..16 are read in reversed pair order (15,16 .. 9,10),
    // so both curves are traversed in the same parameter direction
    // (presumably left-to-right -- confirm against the RoI producer).
    T p0_x = offset_bottom_rois[1] * spatial_scale;
    T p0_y = offset_bottom_rois[2] * spatial_scale;
    T p1_x = offset_bottom_rois[3] * spatial_scale;
    T p1_y = offset_bottom_rois[4] * spatial_scale;
    T p2_x = offset_bottom_rois[5] * spatial_scale;
    T p2_y = offset_bottom_rois[6] * spatial_scale;
    T p3_x = offset_bottom_rois[7] * spatial_scale;
    T p3_y = offset_bottom_rois[8] * spatial_scale;
    T p4_x = offset_bottom_rois[15] * spatial_scale;
    T p4_y = offset_bottom_rois[16] * spatial_scale;
    T p5_x = offset_bottom_rois[13] * spatial_scale;
    T p5_y = offset_bottom_rois[14] * spatial_scale;
    T p6_x = offset_bottom_rois[11] * spatial_scale;
    T p6_y = offset_bottom_rois[12] * spatial_scale;
    T p7_x = offset_bottom_rois[9] * spatial_scale;
    T p7_y = offset_bottom_rois[10] * spatial_scale;
    // compute the coords
    // u walks along both curves; v interpolates linearly between the two
    // curve points to locate this bin's center, minus the alignment offset.
    const T u = pw / static_cast<T>(pooled_width);
    const T v = ph / static_cast<T>(pooled_height);
    const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u);
    const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u);
    const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u);
    const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u);
    const T x_center = x1 * v + x0 * (1. - v) - offset;
    const T y_center = y1 * v + y0 * (1. - v) - offset;
    // RoI extent estimated from the end points of the two curves.
    T roi_width = max(abs(p0_x - p3_x), abs(p4_x - p7_x));
    T roi_height = max(abs(p0_y - p3_y), abs(p4_y - p7_y));
    if (!aligned) {  // for backward-compatibility only
      roi_width = max(roi_width, (T)1.);
      roi_height = max(roi_height, (T)1.);
    }
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
    const T *offset_bottom_data =
        bottom_data + (roi_batch_ind * channels + c) * height * width;
    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
                             ? sampling_ratio
                             : ceil(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
    // We do average (integral) pooling inside a bin
    // When the grid is empty, output zeros == 0/1, instead of NaN.
    const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. = 4
    T output_val = 0.;
    for (int iy = 0; iy < roi_bin_grid_h; iy++)  // e.g., iy = 0, 1
    {
      // Sample points sit at the centers of a roi_bin_grid_h x roi_bin_grid_w
      // sub-grid spanning the bin around (x_center, y_center).
      const T y = y_center - (T)0.5 * bin_size_h +
                  static_cast<T>(iy + .5f) * bin_size_h /
                      static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const T x = x_center - (T)0.5 * bin_size_w +
                    static_cast<T>(ix + .5f) * bin_size_w /
                        static_cast<T>(roi_bin_grid_w);
        T val = bilinear_interpolate(offset_bottom_data, height, width, y, x,
                                     index);
        output_val += val;
      }
    }
    output_val /= count;
    top_data[index] = output_val;
  }
}
// Backward pass of BezierAlign: scatters each pooled output's gradient back
// to the four bilinear-interpolation corners of every sampling point. Uses
// atomicAdd because neighboring bins and overlapping RoIs may write the same
// input cell. Geometry (control-point mapping, bin centers, sampling grid)
// mirrors the forward kernel and must stay in sync with it.
template <typename T>
__global__ void bezier_align_backward_cuda_kernel(
    const int nthreads, const T *top_diff, const T *bottom_rois, T *bottom_diff,
    const int pooled_height, const int pooled_width, const T spatial_scale,
    const int sampling_ratio, bool aligned, const int channels,
    const int height, const int width) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;
    // beziers have size Nx(1+8*2) = Nx17
    const T *offset_bottom_rois = bottom_rois + n * 17;
    int roi_batch_ind = offset_bottom_rois[0];
    // Do not use rounding; this implementation detail is critical
    T offset = aligned ? (T)0.5 : (T)0.0;
    // Control points 9..16 are read in reversed pair order -- identical
    // mapping to the forward kernel, so gradients land at the same
    // sampling locations.
    T p0_x = offset_bottom_rois[1] * spatial_scale;
    T p0_y = offset_bottom_rois[2] * spatial_scale;
    T p1_x = offset_bottom_rois[3] * spatial_scale;
    T p1_y = offset_bottom_rois[4] * spatial_scale;
    T p2_x = offset_bottom_rois[5] * spatial_scale;
    T p2_y = offset_bottom_rois[6] * spatial_scale;
    T p3_x = offset_bottom_rois[7] * spatial_scale;
    T p3_y = offset_bottom_rois[8] * spatial_scale;
    T p4_x = offset_bottom_rois[15] * spatial_scale;
    T p4_y = offset_bottom_rois[16] * spatial_scale;
    T p5_x = offset_bottom_rois[13] * spatial_scale;
    T p5_y = offset_bottom_rois[14] * spatial_scale;
    T p6_x = offset_bottom_rois[11] * spatial_scale;
    T p6_y = offset_bottom_rois[12] * spatial_scale;
    T p7_x = offset_bottom_rois[9] * spatial_scale;
    T p7_y = offset_bottom_rois[10] * spatial_scale;
    // compute the coords
    const T u = pw / static_cast<T>(pooled_width);
    const T v = ph / static_cast<T>(pooled_height);
    const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u);
    const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u);
    const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u);
    const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u);
    const T x_center = x1 * v + x0 * (1. - v) - offset;
    const T y_center = y1 * v + y0 * (1. - v) - offset;
    T roi_width = max(abs(p0_x - p3_x), abs(p4_x - p7_x));
    T roi_height = max(abs(p0_y - p3_y), abs(p4_y - p7_y));
    if (!aligned) {  // for backward-compatibility only
      roi_width = max(roi_width, (T)1.);
      roi_height = max(roi_height, (T)1.);
    }
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
    T *offset_bottom_diff =
        bottom_diff + (roi_batch_ind * channels + c) * height * width;
    int top_offset = (n * channels + c) * pooled_height * pooled_width;
    const T *offset_top_diff = top_diff + top_offset;
    const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
                             ? sampling_ratio
                             : ceil(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
    // We do average (integral) pooling inside a bin
    // NOTE(review): unlike the forward kernel, `count` is not clamped to
    // >= 1; harmless here because a zero-sized grid also skips the loops.
    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4
    for (int iy = 0; iy < roi_bin_grid_h; iy++)  // e.g., iy = 0, 1
    {
      const T y = y_center - (T)0.5 * bin_size_h +
                  static_cast<T>(iy + .5f) * bin_size_h /
                      static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const T x = x_center - (T)0.5 * bin_size_w +
                    static_cast<T>(ix + .5f) * bin_size_w /
                        static_cast<T>(roi_bin_grid_w);
        T w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;
        bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
                                      x_low, x_high, y_low, y_high, index);
        // Split the averaged bin gradient across the four bilinear corners.
        T g1 = top_diff_this_bin * w1 / count;
        T g2 = top_diff_this_bin * w2 / count;
        T g3 = top_diff_this_bin * w3 / count;
        T g4 = top_diff_this_bin * w4 / count;
        // Negative corner indices mean the sample fell outside the feature
        // map; skip the scatter in that case.
        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
          atomicAdd(offset_bottom_diff + y_low * width + x_low,
                    static_cast<T>(g1));
          atomicAdd(offset_bottom_diff + y_low * width + x_high,
                    static_cast<T>(g2));
          atomicAdd(offset_bottom_diff + y_high * width + x_low,
                    static_cast<T>(g3));
          atomicAdd(offset_bottom_diff + y_high * width + x_high,
                    static_cast<T>(g4));
        }  // if
      }    // ix
    }      // iy
  }  // CUDA_1D_KERNEL_LOOP
}  // BezierAlignBackward
#endif // BEZIER_ALIGN_CUDA_KERNEL_CUH
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#ifndef BOX_IOU_QUADRI_CUDA_CUH
#define BOX_IOU_QUADRI_CUDA_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
#include "box_iou_rotated_utils.hpp"
// 2D block with 32 * 16 = 512 threads per block
const int BLOCK_DIM_X = 32;
const int BLOCK_DIM_Y = 16;
// Ceiling division for positive ints: smallest q such that q * y >= x.
// Used to size CUDA grids from element counts and block dimensions.
inline int divideUP(const int x, const int y) {
  const int q = (x + y - 1) / y;
  return q;
}
// Computes IoU between pairs of quadrilateral boxes (8 floats each: four
// (x, y) vertices). aligned == true pairs box i of both lists (n_boxes1
// outputs); otherwise the full n_boxes1 x n_boxes2 grid is computed, one
// output element per loop iteration. `mode_flag` selects the overlap
// criterion passed through to single_box_iou_quadri.
template <typename T>
__global__ void box_iou_quadri_cuda_kernel(
    const int n_boxes1, const int n_boxes2, const T* dev_boxes1,
    const T* dev_boxes2, T* dev_ious, const int mode_flag, const bool aligned) {
  if (aligned) {
    CUDA_1D_KERNEL_LOOP(index, n_boxes1) {
      // Stage both quads in registers/local memory before the IoU call.
      float quad1[8];
      float quad2[8];
      const int src = index * 8;
      for (int j = 0; j < 8; j++) {
        quad1[j] = dev_boxes1[src + j];
        quad2[j] = dev_boxes2[src + j];
      }
      dev_ious[index] =
          single_box_iou_quadri<T>(quad1, quad2, mode_flag);
    }
  } else {
    CUDA_1D_KERNEL_LOOP(index, n_boxes1 * n_boxes2) {
      const int src1 = (index / n_boxes2) * 8;
      const int src2 = (index % n_boxes2) * 8;
      float quad1[8];
      float quad2[8];
      for (int j = 0; j < 8; j++) {
        quad1[j] = dev_boxes1[src1 + j];
        quad2[j] = dev_boxes2[src2 + j];
      }
      dev_ious[index] =
          single_box_iou_quadri<T>(quad1, quad2, mode_flag);
    }
  }
}
#endif
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
#include "pytorch_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
#endif #endif
#ifdef MMCV_WITH_HIP #ifdef HIP_DIFF
#define WARP_SIZE 64 #define WARP_SIZE 64
#else #else
#define WARP_SIZE 32 #define WARP_SIZE 32
...@@ -29,22 +29,22 @@ __device__ inline int Loc2Index(const int n, const int c, const int h, ...@@ -29,22 +29,22 @@ __device__ inline int Loc2Index(const int n, const int c, const int h,
int index = w + (h + (c + n * channel_num) * height) * width; int index = w + (h + (c + n * channel_num) * height) * width;
return index; return index;
} }
#ifndef MMCV_WITH_HIP #ifndef HIP_DIFF
/* TODO: move this to a common place */ /* TODO: move this to a common place */
template <typename scalar_t> template <typename scalar_t>
__device__ inline scalar_t min(scalar_t a, scalar_t b) { __device__ inline scalar_t mmcv_min(scalar_t a, scalar_t b) {
return a < b ? a : b; return a < b ? a : b;
} }
template <typename scalar_t> template <typename scalar_t>
__device__ inline scalar_t max(scalar_t a, scalar_t b) { __device__ inline scalar_t mmcv_max(scalar_t a, scalar_t b) {
return a > b ? a : b; return a > b ? a : b;
} }
#endif #endif
template <typename scalar_t> template <typename scalar_t>
__device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) { __device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) {
for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
#ifdef MMCV_WITH_HIP #ifdef HIP_DIFF
val += __shfl_down(val, offset); val += __shfl_down(val, offset);
#else #else
val += __shfl_down_sync(FULL_MASK, val, offset); val += __shfl_down_sync(FULL_MASK, val, offset);
...@@ -55,11 +55,11 @@ __device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) { ...@@ -55,11 +55,11 @@ __device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) {
template <> template <>
__device__ __forceinline__ phalf warpReduceSum(phalf val) { __device__ __forceinline__ phalf warpReduceSum(phalf val) {
for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
#ifdef MMCV_WITH_HIP #ifdef HIP_DIFF
__PHALF(val) += __shfl_down(val, offset); __PHALF(val) += __shfl_down(FULL_MASK, val, offset);
#else #else
__PHALF(val) += __PHALF(val) +=
__shfl_down_sync(FULL_MASK, __PHALF(val).operator __half(), offset); __shfl_down_sync(FULL_MASK, static_cast<__half>(__PHALF(val)), offset);
#endif #endif
return val; return val;
} }
...@@ -316,7 +316,7 @@ __global__ void CARAFEBackward_Mask(const int num_kernels, ...@@ -316,7 +316,7 @@ __global__ void CARAFEBackward_Mask(const int num_kernels,
output_val += top_diff[top_id] * bottom_data[bottom_id]; output_val += top_diff[top_id] * bottom_data[bottom_id];
} }
} }
#ifdef MMCV_WITH_HIP #ifdef HIP_DIFF
__syncthreads(); __syncthreads();
#else #else
__syncwarp(); __syncwarp();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment