Commit fdeee889 authored by limm
Browse files

release v1.6.1 of mmcv

parent df465820
# Copyright (c) OpenMMLab. All rights reserved.
from .active_rotated_filter import active_rotated_filter
from .assign_score_withk import assign_score_withk
from .ball_query import ball_query
from .bbox import bbox_overlaps
......@@ -6,7 +7,9 @@ from .border_align import BorderAlign, border_align
from .box_iou_rotated import box_iou_rotated
from .carafe import CARAFE, CARAFENaive, CARAFEPack, carafe, carafe_naive
from .cc_attention import CrissCrossAttention
from .chamfer_distance import chamfer_distance
from .contour_expand import contour_expand
from .convex_iou import convex_giou, convex_iou
from .corner_pool import CornerPool
from .correlation import Correlation
from .deform_conv import DeformConv2d, DeformConv2dPack, deform_conv2d
......@@ -16,6 +19,7 @@ from .deprecated_wrappers import Conv2d_deprecated as Conv2d
from .deprecated_wrappers import ConvTranspose2d_deprecated as ConvTranspose2d
from .deprecated_wrappers import Linear_deprecated as Linear
from .deprecated_wrappers import MaxPool2d_deprecated as MaxPool2d
from .diff_iou_rotated import diff_iou_rotated_2d, diff_iou_rotated_3d
from .focal_loss import (SigmoidFocalLoss, SoftmaxFocalLoss,
sigmoid_focal_loss, softmax_focal_loss)
from .furthest_point_sample import (furthest_point_sample,
......@@ -25,9 +29,11 @@ from .gather_points import gather_points
from .group_points import GroupAll, QueryAndGroup, grouping_operation
from .info import (get_compiler_version, get_compiling_cuda_version,
get_onnxruntime_op_path)
from .iou3d import boxes_iou_bev, nms_bev, nms_normal_bev
from .iou3d import (boxes_iou3d, boxes_iou_bev, boxes_overlap_bev, nms3d,
nms3d_normal, nms_bev, nms_normal_bev)
from .knn import knn
from .masked_conv import MaskedConv2d, masked_conv2d
from .min_area_polygons import min_area_polygons
from .modulated_deform_conv import (ModulatedDeformConv2d,
ModulatedDeformConv2dPack,
modulated_deform_conv2d)
......@@ -38,15 +44,25 @@ from .point_sample import (SimpleRoIAlign, point_sample,
rel_roi_point_to_rel_img_point)
from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu,
points_in_boxes_part)
from .points_in_polygons import points_in_polygons
from .points_sampler import PointsSampler
from .prroi_pool import PrRoIPool, prroi_pool
from .psa_mask import PSAMask
from .riroi_align_rotated import RiRoIAlignRotated, riroi_align_rotated
from .roi_align import RoIAlign, roi_align
from .roi_align_rotated import RoIAlignRotated, roi_align_rotated
from .roi_pool import RoIPool, roi_pool
from .roiaware_pool3d import RoIAwarePool3d
from .roipoint_pool3d import RoIPointPool3d
from .rotated_feature_align import rotated_feature_align
from .saconv import SAConv2d
from .scatter_points import DynamicScatter, dynamic_scatter
from .sparse_conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d,
SparseConvTranspose3d, SparseInverseConv2d,
SparseInverseConv3d, SubMConv2d, SubMConv3d)
from .sparse_modules import SparseModule, SparseSequential
from .sparse_pool import SparseMaxPool2d, SparseMaxPool3d
from .sparse_structure import SparseConvTensor, scatter_nd
from .sync_bn import SyncBatchNorm
from .three_interpolate import three_interpolate
from .three_nn import three_nn
......@@ -70,12 +86,21 @@ __all__ = [
'SAConv2d', 'TINShift', 'tin_shift', 'assign_score_withk',
'box_iou_rotated', 'RoIPointPool3d', 'nms_rotated', 'knn', 'ball_query',
'upfirdn2d', 'FusedBiasLeakyReLU', 'fused_bias_leakyrelu',
'rotated_feature_align', 'RiRoIAlignRotated', 'riroi_align_rotated',
'RoIAlignRotated', 'roi_align_rotated', 'pixel_group', 'QueryAndGroup',
'GroupAll', 'grouping_operation', 'contour_expand', 'three_nn',
'three_interpolate', 'MultiScaleDeformableAttention', 'BorderAlign',
'border_align', 'gather_points', 'furthest_point_sample',
'furthest_point_sample_with_dist', 'PointsSampler', 'Correlation',
'boxes_iou_bev', 'nms_bev', 'nms_normal_bev', 'Voxelization',
'voxelization', 'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d',
'points_in_boxes_part', 'points_in_boxes_cpu', 'points_in_boxes_all'
'boxes_iou3d', 'boxes_iou_bev', 'boxes_overlap_bev', 'nms_bev',
'nms_normal_bev', 'nms3d', 'nms3d_normal', 'Voxelization', 'voxelization',
'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d', 'SparseConv2d',
'SparseConv3d', 'SparseConvTranspose2d', 'SparseConvTranspose3d',
'SparseInverseConv2d', 'SparseInverseConv3d', 'SubMConv2d', 'SubMConv3d',
'SparseModule', 'SparseSequential', 'SparseMaxPool2d', 'SparseMaxPool3d',
'SparseConvTensor', 'scatter_nd', 'points_in_boxes_part',
'points_in_boxes_cpu', 'points_in_boxes_all', 'points_in_polygons',
'min_area_polygons', 'active_rotated_filter', 'convex_iou', 'convex_giou',
'diff_iou_rotated_2d', 'diff_iou_rotated_3d', 'chamfer_distance',
'PrRoIPool', 'prroi_pool'
]
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
import torch
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from ..utils import ext_loader
ext_module = ext_loader.load_ext(
'_ext',
['active_rotated_filter_forward', 'active_rotated_filter_backward'])
class ActiveRotatedFilterFunction(Function):
    """Encoding the orientation information and generating orientation-
    sensitive features.

    The details are described in the paper `Align Deep Features for Oriented
    Object Detection <https://arxiv.org/abs/2008.09397>`_.
    """

    @staticmethod
    def forward(ctx, input: torch.Tensor,
                indices: torch.Tensor) -> torch.Tensor:
        """
        Args:
            input (torch.Tensor): Input features with shape
                [num_output_planes, num_input_planes, num_orientations, H, W].
            indices (torch.Tensor): Indices with shape
                [num_orientations, H, W, num_rotations].

        Returns:
            torch.Tensor: Refined features with shape [num_output_planes *
            num_rotations, num_input_planes * num_orientations, H, W].
        """
        # Both tensors are needed again in backward.
        ctx.save_for_backward(input, indices)
        out_planes, in_planes, _, _, _ = input.size()
        num_orientations, height, width, num_rotations = indices.size()
        # The CUDA/CPU extension fills this pre-allocated buffer in place.
        output = input.new_zeros(
            (out_planes * num_rotations, in_planes * num_orientations, height,
             width))
        ext_module.active_rotated_filter_forward(input, indices, output)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, None]:
        """
        Args:
            grad_out (torch.Tensor): The gradient of output features
                with shape [num_output_planes * num_rotations,
                num_input_planes * num_orientations, H, W].

        Returns:
            torch.Tensor: The gradient of input features with shape
            [num_output_planes, num_input_planes, num_orientations, H, W].
        """
        input, indices = ctx.saved_tensors
        # Gradient buffer matches the original input layout; the extension
        # scatters grad_out back through the rotation indices.
        grad_in = torch.zeros_like(input)
        ext_module.active_rotated_filter_backward(grad_out, indices, grad_in)
        # ``indices`` is not differentiable, hence the trailing None.
        return grad_in, None


active_rotated_filter = ActiveRotatedFilterFunction.apply
from typing import Tuple
import torch
from torch.autograd import Function
from ..utils import ext_loader
......@@ -27,11 +30,11 @@ class AssignScoreWithK(Function):
@staticmethod
def forward(ctx,
scores,
point_features,
center_features,
knn_idx,
aggregate='sum'):
scores: torch.Tensor,
point_features: torch.Tensor,
center_features: torch.Tensor,
knn_idx: torch.Tensor,
aggregate: str = 'sum') -> torch.Tensor:
"""
Args:
scores (torch.Tensor): (B, npoint, K, M), predicted scores to
......@@ -78,15 +81,20 @@ class AssignScoreWithK(Function):
return output
@staticmethod
def backward(ctx, grad_out):
def backward(
ctx, grad_out: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, None, None]:
"""
Args:
grad_out (torch.Tensor): (B, out_dim, npoint, K)
Returns:
grad_scores (torch.Tensor): (B, npoint, K, M)
grad_point_features (torch.Tensor): (B, N, M, out_dim)
grad_center_features (torch.Tensor): (B, N, M, out_dim)
tuple[torch.Tensor]: A tuple contains five elements. The first one
is the gradient of ``scores`` whose shape is (B, npoint, K, M). The
second is the gradient of ``point_features`` whose shape is
(B, N, M, out_dim). The third is the gradient of
``center_features`` with the shape of (B, N, M, out_dim). The last
two are ``None``.
"""
_, point_features, center_features, scores, knn_idx = ctx.saved_tensors
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
import torch
from torch.autograd import Function
......@@ -18,12 +20,13 @@ class BallQuery(Function):
min_radius (float): minimum radius of the balls.
max_radius (float): maximum radius of the balls.
sample_num (int): maximum number of features in the balls.
xyz (Tensor): (B, N, 3) xyz coordinates of the features.
center_xyz (Tensor): (B, npoint, 3) centers of the ball query.
xyz (torch.Tensor): (B, N, 3) xyz coordinates of the features.
center_xyz (torch.Tensor): (B, npoint, 3) centers of the ball
query.
Returns:
Tensor: (B, npoint, nsample) tensor with the indices of
the features that form the query balls.
torch.Tensor: (B, npoint, nsample) tensor with the indices of the
features that form the query balls.
"""
assert center_xyz.is_contiguous()
assert xyz.is_contiguous()
......@@ -48,7 +51,7 @@ class BallQuery(Function):
return idx
@staticmethod
def backward(ctx, a=None):
def backward(ctx, a=None) -> Tuple[None, None, None, None]:
return None, None, None, None
......
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['bbox_overlaps'])
def bbox_overlaps(bboxes1, bboxes2, mode='iou', aligned=False, offset=0):
def _bbox_overlaps_cpu(bboxes1: torch.Tensor,
bboxes2: torch.Tensor,
mode: str = 'iou',
aligned: bool = False,
offset: int = 0) -> torch.Tensor:
assert mode in ['iou', 'iof']
if aligned:
lt = torch.max(bboxes1[:, :2], bboxes2[:, :2]) # [rows, 2]
rb = torch.min(bboxes1[:, 2:], bboxes2[:, 2:]) # [rows, 2]
wh = (rb - lt + offset).clamp(min=0) # [rows, 2]
overlap = wh[:, 0] * wh[:, 1]
area1 = (bboxes1[:, 2] - bboxes1[:, 0] + offset) * (
bboxes1[:, 3] - bboxes1[:, 1] + offset)
if mode == 'iou':
area2 = (bboxes2[:, 2] - bboxes2[:, 0] + offset) * (
bboxes2[:, 3] - bboxes2[:, 1] + offset)
ious = overlap / (area1 + area2 - overlap)
else:
ious = overlap / area1
else:
lt = torch.max(bboxes1[:, None, :2], bboxes2[:, :2]) # [rows, cols, 2]
rb = torch.min(bboxes1[:, None, 2:], bboxes2[:, 2:]) # [rows, cols, 2]
wh = (rb - lt + offset).clamp(min=0) # [rows, cols, 2]
overlap = wh[:, :, 0] * wh[:, :, 1]
area1 = (bboxes1[:, 2] - bboxes1[:, 0] + offset) * (
bboxes1[:, 3] - bboxes1[:, 1] + offset)
if mode == 'iou':
area2 = (bboxes2[:, 2] - bboxes2[:, 0] + offset) * (
bboxes2[:, 3] - bboxes2[:, 1] + offset)
ious = overlap / (area1[:, None] + area2 - overlap)
else:
ious = overlap / (area1[:, None])
return ious
def bbox_overlaps(bboxes1: torch.Tensor,
bboxes2: torch.Tensor,
mode: str = 'iou',
aligned: bool = False,
offset: int = 0) -> torch.Tensor:
"""Calculate overlap between two set of bboxes.
If ``aligned`` is ``False``, then calculate the ious between each bbox
......@@ -12,14 +59,16 @@ def bbox_overlaps(bboxes1, bboxes2, mode='iou', aligned=False, offset=0):
bboxes1 and bboxes2.
Args:
bboxes1 (Tensor): shape (m, 4) in <x1, y1, x2, y2> format or empty.
bboxes2 (Tensor): shape (n, 4) in <x1, y1, x2, y2> format or empty.
If aligned is ``True``, then m and n must be equal.
bboxes1 (torch.Tensor): shape (m, 4) in <x1, y1, x2, y2> format or
empty.
bboxes2 (torch.Tensor): shape (n, 4) in <x1, y1, x2, y2> format or
empty. If aligned is ``True``, then m and n must be equal.
mode (str): "iou" (intersection over union) or iof (intersection over
foreground).
Returns:
ious(Tensor): shape (m, n) if aligned == False else shape (m, 1)
torch.Tensor: Return the ious betweens boxes. If ``aligned`` is
``False``, the shape of ious is (m, n) else (m, 1).
Example:
>>> bboxes1 = torch.FloatTensor([
......@@ -63,10 +112,19 @@ def bbox_overlaps(bboxes1, bboxes2, mode='iou', aligned=False, offset=0):
if rows * cols == 0:
return bboxes1.new(rows, 1) if aligned else bboxes1.new(rows, cols)
if aligned:
ious = bboxes1.new_zeros(rows)
if bboxes1.device.type == 'cpu':
return _bbox_overlaps_cpu(
bboxes1, bboxes2, mode=mode, aligned=aligned, offset=offset)
else:
ious = bboxes1.new_zeros((rows, cols))
ext_module.bbox_overlaps(
bboxes1, bboxes2, ious, mode=mode_flag, aligned=aligned, offset=offset)
return ious
if aligned:
ious = bboxes1.new_zeros(rows)
else:
ious = bboxes1.new_zeros((rows, cols))
ext_module.bbox_overlaps(
bboxes1,
bboxes2,
ious,
mode=mode_flag,
aligned=aligned,
offset=offset)
return ious
......@@ -2,6 +2,8 @@
# modified from
# https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/border_align.py
from typing import Tuple
import torch
import torch.nn as nn
from torch.autograd import Function
......@@ -21,7 +23,8 @@ class BorderAlignFunction(Function):
'mmcv::MMCVBorderAlign', input, boxes, pool_size_i=pool_size)
@staticmethod
def forward(ctx, input, boxes, pool_size):
def forward(ctx, input: torch.Tensor, boxes: torch.Tensor,
pool_size: int) -> torch.Tensor:
ctx.pool_size = pool_size
ctx.input_shape = input.size()
......@@ -45,7 +48,8 @@ class BorderAlignFunction(Function):
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
def backward(ctx,
grad_output: torch.Tensor) -> Tuple[torch.Tensor, None, None]:
boxes, argmax_idx = ctx.saved_tensors
grad_input = grad_output.new_zeros(ctx.input_shape)
# complex head architecture may cause grad_output uncontiguous
......@@ -72,24 +76,25 @@ class BorderAlign(nn.Module):
For each border line (e.g. top, left, bottom or right) of each box,
border_align does the following:
1. uniformly samples `pool_size`+1 positions on this line, involving \
the start and end points.
2. the corresponding features on these points are computed by \
bilinear interpolation.
3. max pooling over all the `pool_size`+1 positions are used for \
computing pooled feature.
1. uniformly samples ``pool_size`` +1 positions on this line, involving
the start and end points.
2. the corresponding features on these points are computed by bilinear
interpolation.
3. max pooling over all the ``pool_size`` +1 positions are used for
computing pooled feature.
Args:
pool_size (int): number of positions sampled over the boxes' borders
(e.g. top, bottom, left, right).
"""
def __init__(self, pool_size):
super(BorderAlign, self).__init__()
def __init__(self, pool_size: int):
super().__init__()
self.pool_size = pool_size
def forward(self, input, boxes):
def forward(self, input: torch.Tensor,
boxes: torch.Tensor) -> torch.Tensor:
"""
Args:
input: Features with shape [N,4C,H,W]. Channels ranged in [0,C),
......@@ -98,8 +103,8 @@ class BorderAlign(nn.Module):
boxes: Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2).
Returns:
Tensor: Pooled features with shape [N,C,H*W,4]. The order is
(top,left,bottom,right) for the last dimension.
torch.Tensor: Pooled features with shape [N,C,H*W,4]. The order is
(top,left,bottom,right) for the last dimension.
"""
return border_align(input, boxes, self.pool_size)
......
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['box_iou_rotated'])
def box_iou_rotated(bboxes1, bboxes2, mode='iou', aligned=False):
def box_iou_rotated(bboxes1: torch.Tensor,
bboxes2: torch.Tensor,
mode: str = 'iou',
aligned: bool = False,
clockwise: bool = True) -> torch.Tensor:
"""Return intersection-over-union (Jaccard index) of boxes.
Both sets of boxes are expected to be in
......@@ -14,18 +20,110 @@ def box_iou_rotated(bboxes1, bboxes2, mode='iou', aligned=False):
of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
bboxes1 and bboxes2.
Arguments:
boxes1 (Tensor): rotated bboxes 1. \
It has shape (N, 5), indicating (x, y, w, h, theta) for each row.
Note that theta is in radian.
boxes2 (Tensor): rotated bboxes 2. \
It has shape (M, 5), indicating (x, y, w, h, theta) for each row.
Note that theta is in radian.
.. note::
The operator assumes:
1) The positive direction along x axis is left -> right.
2) The positive direction along y axis is top -> down.
3) The w border is in parallel with x axis when angle = 0.
However, there are 2 opposite definitions of the positive angular
direction, clockwise (CW) and counter-clockwise (CCW). MMCV supports
both definitions and uses CW by default.
Please set ``clockwise=False`` if you are using the CCW definition.
The coordinate system when ``clockwise`` is ``True`` (default)
.. code-block:: none
0-------------------> x (0 rad)
| A-------------B
| | |
| | box h
| | angle=0 |
| D------w------C
v
y (pi/2 rad)
In such coordination system the rotation matrix is
.. math::
\\begin{pmatrix}
\\cos\\alpha & -\\sin\\alpha \\\\
\\sin\\alpha & \\cos\\alpha
\\end{pmatrix}
The coordinates of the corner point A can be calculated as:
.. math::
P_A=
\\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix}
=
\\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} +
\\begin{pmatrix}\\cos\\alpha & -\\sin\\alpha \\\\
\\sin\\alpha & \\cos\\alpha\\end{pmatrix}
\\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\
=
\\begin{pmatrix} x_{center}-0.5w\\cos\\alpha+0.5h\\sin\\alpha
\\\\
y_{center}-0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix}
The coordinate system when ``clockwise`` is ``False``
.. code-block:: none
0-------------------> x (0 rad)
| A-------------B
| | |
| | box h
| | angle=0 |
| D------w------C
v
y (-pi/2 rad)
In such coordination system the rotation matrix is
.. math::
\\begin{pmatrix}
\\cos\\alpha & \\sin\\alpha \\\\
-\\sin\\alpha & \\cos\\alpha
\\end{pmatrix}
The coordinates of the corner point A can be calculated as:
.. math::
P_A=
\\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix}
=
\\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} +
\\begin{pmatrix}\\cos\\alpha & \\sin\\alpha \\\\
-\\sin\\alpha & \\cos\\alpha\\end{pmatrix}
\\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\
=
\\begin{pmatrix} x_{center}-0.5w\\cos\\alpha-0.5h\\sin\\alpha
\\\\
y_{center}+0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix}
Args:
boxes1 (torch.Tensor): rotated bboxes 1. It has shape (N, 5),
indicating (x, y, w, h, theta) for each row. Note that theta is in
radian.
boxes2 (torch.Tensor): rotated bboxes 2. It has shape (M, 5),
indicating (x, y, w, h, theta) for each row. Note that theta is in
radian.
mode (str): "iou" (intersection over union) or iof (intersection over
foreground).
clockwise (bool): flag indicating whether the positive angular
orientation is clockwise. default True.
`New in version 1.4.3.`
Returns:
ious(Tensor): shape (N, M) if aligned == False else shape (N,)
torch.Tensor: Return the ious betweens boxes. If ``aligned`` is
``False``, the shape of ious is (N, M) else (N,).
"""
assert mode in ['iou', 'iof']
mode_dict = {'iou': 0, 'iof': 1}
......@@ -35,7 +133,12 @@ def box_iou_rotated(bboxes1, bboxes2, mode='iou', aligned=False):
if aligned:
ious = bboxes1.new_zeros(rows)
else:
ious = bboxes1.new_zeros((rows * cols))
ious = bboxes1.new_zeros(rows * cols)
if not clockwise:
flip_mat = bboxes1.new_ones(bboxes1.shape[-1])
flip_mat[-1] = -1
bboxes1 = bboxes1 * flip_mat
bboxes2 = bboxes2 * flip_mat
bboxes1 = bboxes1.contiguous()
bboxes2 = bboxes2.contiguous()
ext_module.box_iou_rotated(
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch.autograd import Function
from torch.nn.modules.module import Module
......@@ -17,7 +20,8 @@ ext_module = ext_loader.load_ext('_ext', [
class CARAFENaiveFunction(Function):
@staticmethod
def symbolic(g, features, masks, kernel_size, group_size, scale_factor):
def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int,
group_size: int, scale_factor: int) -> Tensor:
return g.op(
'mmcv::MMCVCARAFENaive',
features,
......@@ -27,7 +31,8 @@ class CARAFENaiveFunction(Function):
scale_factor_f=scale_factor)
@staticmethod
def forward(ctx, features, masks, kernel_size, group_size, scale_factor):
def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int,
group_size: int, scale_factor: int) -> Tensor:
assert scale_factor >= 1
assert masks.size(1) == kernel_size * kernel_size * group_size
assert masks.size(-1) == features.size(-1) * scale_factor
......@@ -50,12 +55,15 @@ class CARAFENaiveFunction(Function):
group_size=group_size,
scale_factor=scale_factor)
if features.requires_grad or masks.requires_grad:
if features.requires_grad or masks.requires_grad or \
torch.__version__ == 'parrots':
ctx.save_for_backward(features, masks)
return output
@staticmethod
def backward(ctx, grad_output):
def backward(
ctx,
grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]:
assert grad_output.is_cuda
features, masks = ctx.saved_tensors
......@@ -83,8 +91,8 @@ carafe_naive = CARAFENaiveFunction.apply
class CARAFENaive(Module):
def __init__(self, kernel_size, group_size, scale_factor):
super(CARAFENaive, self).__init__()
def __init__(self, kernel_size: int, group_size: int, scale_factor: int):
super().__init__()
assert isinstance(kernel_size, int) and isinstance(
group_size, int) and isinstance(scale_factor, int)
......@@ -92,7 +100,7 @@ class CARAFENaive(Module):
self.group_size = group_size
self.scale_factor = scale_factor
def forward(self, features, masks):
def forward(self, features: Tensor, masks: Tensor) -> Tensor:
return carafe_naive(features, masks, self.kernel_size, self.group_size,
self.scale_factor)
......@@ -100,7 +108,8 @@ class CARAFENaive(Module):
class CARAFEFunction(Function):
@staticmethod
def symbolic(g, features, masks, kernel_size, group_size, scale_factor):
def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int,
group_size: int, scale_factor: int) -> Tensor:
return g.op(
'mmcv::MMCVCARAFE',
features,
......@@ -110,7 +119,8 @@ class CARAFEFunction(Function):
scale_factor_f=scale_factor)
@staticmethod
def forward(ctx, features, masks, kernel_size, group_size, scale_factor):
def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int,
group_size: int, scale_factor: int) -> Tensor:
assert scale_factor >= 1
assert masks.size(1) == kernel_size * kernel_size * group_size
assert masks.size(-1) == features.size(-1) * scale_factor
......@@ -139,12 +149,15 @@ class CARAFEFunction(Function):
group_size=group_size,
scale_factor=scale_factor)
if features.requires_grad or masks.requires_grad:
if features.requires_grad or masks.requires_grad or \
torch.__version__ == 'parrots':
ctx.save_for_backward(features, masks, rfeatures)
return output
@staticmethod
def backward(ctx, grad_output):
def backward(
ctx,
grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]:
assert grad_output.is_cuda
features, masks, rfeatures = ctx.saved_tensors
......@@ -180,7 +193,8 @@ carafe = CARAFEFunction.apply
class CARAFE(Module):
""" CARAFE: Content-Aware ReAssembly of FEatures
Please refer to https://arxiv.org/abs/1905.02188 for more details.
Please refer to `CARAFE: Content-Aware ReAssembly of FEatures
<https://arxiv.org/abs/1905.02188>`_ for more details.
Args:
kernel_size (int): reassemble kernel size
......@@ -191,8 +205,8 @@ class CARAFE(Module):
upsampled feature map
"""
def __init__(self, kernel_size, group_size, scale_factor):
super(CARAFE, self).__init__()
def __init__(self, kernel_size: int, group_size: int, scale_factor: int):
super().__init__()
assert isinstance(kernel_size, int) and isinstance(
group_size, int) and isinstance(scale_factor, int)
......@@ -200,7 +214,7 @@ class CARAFE(Module):
self.group_size = group_size
self.scale_factor = scale_factor
def forward(self, features, masks):
def forward(self, features: Tensor, masks: Tensor) -> Tensor:
return carafe(features, masks, self.kernel_size, self.group_size,
self.scale_factor)
......@@ -211,8 +225,8 @@ class CARAFEPack(nn.Module):
compressor 2) content encoder 3) CARAFE op.
Official implementation of ICCV 2019 paper
CARAFE: Content-Aware ReAssembly of FEatures
Please refer to https://arxiv.org/abs/1905.02188 for more details.
`CARAFE: Content-Aware ReAssembly of FEatures
<https://arxiv.org/abs/1905.02188>`_.
Args:
channels (int): input feature channels
......@@ -228,14 +242,14 @@ class CARAFEPack(nn.Module):
"""
def __init__(self,
channels,
scale_factor,
up_kernel=5,
up_group=1,
encoder_kernel=3,
encoder_dilation=1,
compressed_channels=64):
super(CARAFEPack, self).__init__()
channels: int,
scale_factor: int,
up_kernel: int = 5,
up_group: int = 1,
encoder_kernel: int = 3,
encoder_dilation: int = 1,
compressed_channels: int = 64):
super().__init__()
self.channels = channels
self.scale_factor = scale_factor
self.up_kernel = up_kernel
......@@ -261,7 +275,7 @@ class CARAFEPack(nn.Module):
xavier_init(m, distribution='uniform')
normal_init(self.content_encoder, std=0.001)
def kernel_normalizer(self, mask):
def kernel_normalizer(self, mask: Tensor) -> Tensor:
mask = F.pixel_shuffle(mask, self.scale_factor)
n, mask_c, h, w = mask.size()
# use float division explicitly,
......@@ -274,11 +288,11 @@ class CARAFEPack(nn.Module):
return mask
def feature_reassemble(self, x, mask):
def feature_reassemble(self, x: Tensor, mask: Tensor) -> Tensor:
x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor)
return x
def forward(self, x):
def forward(self, x: Tensor) -> Tensor:
compressed_x = self.channel_compressor(x)
mask = self.content_encoder(compressed_x)
mask = self.kernel_normalizer(mask)
......
......@@ -6,7 +6,7 @@ import torch.nn.functional as F
from mmcv.cnn import PLUGIN_LAYERS, Scale
def NEG_INF_DIAG(n, device):
def NEG_INF_DIAG(n: int, device: torch.device) -> torch.Tensor:
"""Returns a diagonal matrix of size [n, n].
The diagonal are all "-inf". This is for avoiding calculating the
......@@ -41,7 +41,7 @@ class CrissCrossAttention(nn.Module):
in_channels (int): Channels of the input feature map.
"""
def __init__(self, in_channels):
def __init__(self, in_channels: int) -> None:
super().__init__()
self.query_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
self.key_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
......@@ -49,14 +49,15 @@ class CrissCrossAttention(nn.Module):
self.gamma = Scale(0.)
self.in_channels = in_channels
def forward(self, x):
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""forward function of Criss-Cross Attention.
Args:
x (Tensor): Input feature. \
shape (batch_size, in_channels, height, width)
x (torch.Tensor): Input feature with the shape of
(batch_size, in_channels, height, width).
Returns:
Tensor: Output of the layer, with shape of \
torch.Tensor: Output of the layer, with the shape of
(batch_size, in_channels, height, width)
"""
B, C, H, W = x.size()
......@@ -77,7 +78,7 @@ class CrissCrossAttention(nn.Module):
return out
def __repr__(self):
def __repr__(self) -> str:
s = self.__class__.__name__
s += f'(in_channels={self.in_channels})'
return s
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Sequence, Tuple
import torch
from torch import Tensor
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from ..utils import ext_loader
ext_module = ext_loader.load_ext(
'_ext', ['chamfer_distance_forward', 'chamfer_distance_backward'])
class ChamferDistanceFunction(Function):
    """This is an implementation of the 2D Chamfer Distance.

    It has been used in the paper `Oriented RepPoints for Aerial Object
    Detection (CVPR 2022) <https://arxiv.org/abs/2105.11111>`_.
    """

    @staticmethod
    def forward(ctx, xyz1: Tensor, xyz2: Tensor) -> Sequence[Tensor]:
        """
        Args:
            xyz1 (Tensor): Point set with shape (B, N, 2).
            xyz2 (Tensor): Point set with shape (B, M, 2).

        Returns:
            Sequence[Tensor]:

                - dist1 (Tensor): Chamfer distance (xyz1 to xyz2) with
                    shape (B, N).
                - dist2 (Tensor): Chamfer distance (xyz2 to xyz1) with
                    shape (B, M).
                - idx1 (Tensor): Index of the nearest point in xyz2 for each
                    point in xyz1, with shape (B, N); used to compute the
                    gradient.
                - idx2 (Tensor): Index of the nearest point in xyz1 for each
                    point in xyz2, with shape (B, M); used to compute the
                    gradient.
        """
        batch_size, n, _ = xyz1.size()
        _, m, _ = xyz2.size()
        device = xyz1.device
        # The extension expects contiguous memory layouts.
        xyz1 = xyz1.contiguous()
        xyz2 = xyz2.contiguous()
        # Pre-allocated output buffers the extension fills in place.
        dist1 = torch.zeros(batch_size, n).to(device)
        dist2 = torch.zeros(batch_size, m).to(device)
        idx1 = torch.zeros(batch_size, n).type(torch.IntTensor).to(device)
        idx2 = torch.zeros(batch_size, m).type(torch.IntTensor).to(device)
        ext_module.chamfer_distance_forward(xyz1, xyz2, dist1, dist2, idx1,
                                            idx2)
        # Saved for use in backward (nearest-neighbour indices included).
        ctx.save_for_backward(xyz1, xyz2, idx1, idx2)
        return dist1, dist2, idx1, idx2

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_dist1: Tensor, grad_dist2: Tensor,
                 grad_idx1: Tensor,
                 grad_idx2: Tensor) -> Tuple[Tensor, Tensor]:
        """
        Args:
            grad_dist1 (Tensor): Gradient of chamfer distance
                (xyz1 to xyz2) with shape (B, N).
            grad_dist2 (Tensor): Gradient of chamfer distance
                (xyz2 to xyz1) with shape (B, M).
            grad_idx1 (Tensor): Unused; the index outputs are not
                differentiable.
            grad_idx2 (Tensor): Unused; the index outputs are not
                differentiable.

        Returns:
            Tuple[Tensor, Tensor]:

                - grad_xyz1 (Tensor): Gradient of the point set with shape
                    (B, N, 2).
                - grad_xyz2 (Tensor): Gradient of the point set with shape
                    (B, M, 2).
        """
        xyz1, xyz2, idx1, idx2 = ctx.saved_tensors
        device = grad_dist1.device
        grad_dist1 = grad_dist1.contiguous()
        grad_dist2 = grad_dist2.contiguous()
        # Zero-initialised gradient buffers the extension fills in place.
        grad_xyz1 = torch.zeros(xyz1.size()).to(device)
        grad_xyz2 = torch.zeros(xyz2.size()).to(device)
        ext_module.chamfer_distance_backward(xyz1, xyz2, grad_xyz1, grad_xyz2,
                                             grad_dist1, grad_dist2, idx1,
                                             idx2)
        return grad_xyz1, grad_xyz2


chamfer_distance = ChamferDistanceFunction.apply
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Union
import numpy as np
import torch
......@@ -7,21 +9,22 @@ from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['contour_expand'])
def contour_expand(kernel_mask, internal_kernel_label, min_kernel_area,
kernel_num):
def contour_expand(kernel_mask: Union[np.array, torch.Tensor],
internal_kernel_label: Union[np.array, torch.Tensor],
min_kernel_area: int, kernel_num: int) -> list:
"""Expand kernel contours so that foreground pixels are assigned into
instances.
Arguments:
kernel_mask (np.array or Tensor): The instance kernel mask with
Args:
kernel_mask (np.array or torch.Tensor): The instance kernel mask with
size hxw.
internal_kernel_label (np.array or Tensor): The instance internal
internal_kernel_label (np.array or torch.Tensor): The instance internal
kernel label with size hxw.
min_kernel_area (int): The minimum kernel area.
kernel_num (int): The instance kernel number.
Returns:
label (list): The instance index map with size hxw.
list: The instance index map with size hxw.
"""
assert isinstance(kernel_mask, (torch.Tensor, np.ndarray))
assert isinstance(internal_kernel_label, (torch.Tensor, np.ndarray))
......@@ -42,7 +45,7 @@ def contour_expand(kernel_mask, internal_kernel_label, min_kernel_area,
internal_kernel_label,
min_kernel_area=min_kernel_area,
kernel_num=kernel_num)
label = label.tolist()
label = label.tolist() # type: ignore
else:
label = ext_module.contour_expand(kernel_mask, internal_kernel_label,
min_kernel_area, kernel_num)
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
import torch
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['convex_iou', 'convex_giou'])
def convex_giou(pointsets: torch.Tensor,
                polygons: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    """Return generalized intersection-over-union (Jaccard index) between point
    sets and polygons.

    Args:
        pointsets (torch.Tensor): It has shape (N, 18),
            indicating (x1, y1, x2, y2, ..., x9, y9) for each row.
        polygons (torch.Tensor): It has shape (N, 8),
            indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: The first element is the gious
        between point sets and polygons with the shape (N,). The second
        element is the gradient of point sets with the shape (N, 18).
    """
    # The extension writes 19 values per row: 18 gradient entries followed
    # by the GIoU value itself.
    output = pointsets.new_zeros((pointsets.size(0), 19))
    ext_module.convex_giou(pointsets, polygons, output)
    # Use a distinct local name; the original shadowed this function's own
    # name with the result tensor.
    gious = output[:, -1]
    points_grad = output[:, 0:-1]
    return gious, points_grad
def convex_iou(pointsets: torch.Tensor,
               polygons: torch.Tensor) -> torch.Tensor:
    """Return intersection-over-union (Jaccard index) between point sets and
    polygons.

    Args:
        pointsets (torch.Tensor): It has shape (N, 18),
            indicating (x1, y1, x2, y2, ..., x9, y9) for each row.
        polygons (torch.Tensor): It has shape (K, 8),
            indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row.

    Returns:
        torch.Tensor: Return the ious between point sets and polygons with the
        shape (N, K).
    """
    num_pointsets = pointsets.size(0)
    num_polygons = polygons.size(0)
    # Preallocate the (N, K) result; the extension fills it in place.
    ious = pointsets.new_zeros((num_pointsets, num_polygons))
    ext_module.convex_iou(pointsets, polygons, ious)
    return ious
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch import nn
from torch import Tensor, nn
from torch.autograd import Function
from ..utils import ext_loader
_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3}
ext_module = ext_loader.load_ext('_ext', [
'top_pool_forward', 'top_pool_backward', 'bottom_pool_forward',
'bottom_pool_backward', 'left_pool_forward', 'left_pool_backward',
'right_pool_forward', 'right_pool_backward'
])
_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3}
def _corner_pool(x: Tensor, dim: int, flip: bool) -> Tensor:
size = x.size(dim)
output = x.clone()
ind = 1
while ind < size:
if flip:
cur_start = 0
cur_len = size - ind
next_start = ind
next_len = size - ind
else:
cur_start = ind
cur_len = size - ind
next_start = 0
next_len = size - ind
# max_temp should be cloned for backward computation
max_temp = output.narrow(dim, cur_start, cur_len).clone()
cur_temp = output.narrow(dim, cur_start, cur_len)
next_temp = output.narrow(dim, next_start, next_len)
cur_temp[...] = torch.where(max_temp > next_temp, max_temp, next_temp)
ind = ind << 1
return output
class TopPoolFunction(Function):
    """Corner pooling that scans the feature map from bottom to top.

    The visible block interleaved two revisions from a diff (duplicate
    ``symbolic``/``forward`` definitions plus a stale ``ext_module``-based
    forward/backward pair); only the current revision is kept here.
    """

    @staticmethod
    def symbolic(g, input: Tensor) -> Tensor:
        # Exported to ONNX as the custom op ``mmcv::MMCVCornerPool``.
        output = g.op(
            'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['top']))
        return output

    @staticmethod
    def forward(ctx, input: Tensor) -> Tensor:
        # Pure-PyTorch pooling along dim 2 with flip=True; gradients flow
        # through torch.where inside _corner_pool, so no custom backward
        # is required.
        return _corner_pool(input, 2, True)
class BottomPoolFunction(Function):
    """Corner pooling that scans the feature map from top to bottom.

    The visible block interleaved two revisions from a diff (duplicate
    ``symbolic``/``forward`` definitions plus a stale ``ext_module``-based
    forward/backward pair); only the current revision is kept here.
    """

    @staticmethod
    def symbolic(g, input: Tensor) -> Tensor:
        # Exported to ONNX as the custom op ``mmcv::MMCVCornerPool``.
        output = g.op(
            'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['bottom']))
        return output

    @staticmethod
    def forward(ctx, input: Tensor) -> Tensor:
        # Pure-PyTorch pooling along dim 2 with flip=False; gradients flow
        # through torch.where inside _corner_pool, so no custom backward
        # is required.
        return _corner_pool(input, 2, False)
class LeftPoolFunction(Function):
    """Corner pooling that scans the feature map from right to left.

    The visible block interleaved two revisions from a diff (duplicate
    ``symbolic``/``forward`` definitions plus a stale ``ext_module``-based
    forward/backward pair); only the current revision is kept here.
    """

    @staticmethod
    def symbolic(g, input: Tensor) -> Tensor:
        # Exported to ONNX as the custom op ``mmcv::MMCVCornerPool``.
        output = g.op(
            'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['left']))
        return output

    @staticmethod
    def forward(ctx, input: Tensor) -> Tensor:
        # Pure-PyTorch pooling along dim 3 with flip=True; gradients flow
        # through torch.where inside _corner_pool, so no custom backward
        # is required.
        return _corner_pool(input, 3, True)
class RightPoolFunction(Function):
    """Corner pooling that scans the feature map from left to right.

    The visible block interleaved two revisions from a diff (duplicate
    ``symbolic``/``forward`` definitions plus a stale ``ext_module``-based
    forward/backward pair); only the current revision is kept here.
    """

    @staticmethod
    def symbolic(g, input: Tensor) -> Tensor:
        # Exported to ONNX as the custom op ``mmcv::MMCVCornerPool``.
        output = g.op(
            'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['right']))
        return output

    @staticmethod
    def forward(ctx, input: Tensor) -> Tensor:
        # Pure-PyTorch pooling along dim 3 with flip=False; gradients flow
        # through torch.where inside _corner_pool, so no custom backward
        # is required.
        return _corner_pool(input, 3, False)
class CornerPool(nn.Module):
......@@ -104,11 +93,13 @@ class CornerPool(nn.Module):
Corner Pooling is a new type of pooling layer that helps a
convolutional network better localize corners of bounding boxes.
Please refer to https://arxiv.org/abs/1808.01244 for more details.
Please refer to `CornerNet: Detecting Objects as Paired Keypoints
<https://arxiv.org/abs/1808.01244>`_ for more details.
Code is modified from https://github.com/princeton-vl/CornerNet-Lite.
Args:
mode(str): Pooling orientation for the pooling layer
mode (str): Pooling orientation for the pooling layer
- 'bottom': Bottom Pooling
- 'left': Left Pooling
......@@ -133,13 +124,13 @@ class CornerPool(nn.Module):
'top': (2, True),
}
def __init__(self, mode):
super(CornerPool, self).__init__()
def __init__(self, mode: str):
super().__init__()
assert mode in self.pool_functions
self.mode = mode
self.corner_pool = self.pool_functions[mode]
self.corner_pool: Function = self.pool_functions[mode]
def forward(self, x):
def forward(self, x: Tensor) -> Tensor:
if torch.__version__ != 'parrots' and torch.__version__ >= '1.5.0':
if torch.onnx.is_in_onnx_export():
assert torch.__version__ >= '1.7.0', \
......@@ -158,4 +149,8 @@ class CornerPool(nn.Module):
pool_tensor = pool_tensor.flip(dim)
return pool_tensor
else:
return self.corner_pool.apply(x)
if torch.onnx.is_in_onnx_export():
return self.corner_pool.apply(x)
else:
dim, flip = self.cummax_dim_flip[self.mode]
return _corner_pool(x, dim, flip)
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
import torch
from torch import Tensor, nn
from torch.autograd import Function
......@@ -15,14 +17,14 @@ class CorrelationFunction(Function):
@staticmethod
def forward(ctx,
input1,
input2,
kernel_size=1,
max_displacement=1,
stride=1,
padding=1,
dilation=1,
dilation_patch=1):
input1: Tensor,
input2: Tensor,
kernel_size: int = 1,
max_displacement: int = 1,
stride: int = 1,
padding: int = 1,
dilation: int = 1,
dilation_patch: int = 1) -> Tensor:
ctx.save_for_backward(input1, input2)
......@@ -60,7 +62,9 @@ class CorrelationFunction(Function):
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
def backward(
ctx, grad_output: Tensor
) -> Tuple[Tensor, Tensor, None, None, None, None, None, None]:
input1, input2 = ctx.saved_tensors
kH, kW = ctx.kernel_size
......
......@@ -13,11 +13,19 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
│ ├── pytorch_cpp_helper.hpp
│ ├── pytorch_cuda_helper.hpp
│ ├── pytorch_device_registry.hpp
│   └── cuda
│   ├── common_cuda_helper.hpp
│   ├── parrots_cudawarpfunction.cuh
│   ├── ...
│   └── ops_cuda_kernel.cuh
│   ├── cuda
│   │ ├── common_cuda_helper.hpp
│   │ ├── parrots_cudawarpfunction.cuh
│   │ ├── ...
│   │ └── ops_cuda_kernel.cuh
|   ├── mps
│   │ ├── MPSLibrary.h
│   │ ├── ...
│   │ └── MPSUtils.h
|   ├── mlu
│   │ └── ...
|   └── utils
│   │ └── ...
├── onnxruntime
│   ├── onnxruntime_register.h
│   ├── onnxruntime_session_options_config_keys.h
......@@ -41,9 +49,15 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
│   ├── cuda
│   │   ├── ...
│   │   └── ops_cuda.cu
│   └── cpu
│   ├── cpu
│   │   ├── ...
│   │   └── ops.cpp
│   ├── mps
│   │   ├── ...
│   |   └── op_mps.mm
│   └── mlu
│      ├── ...
│      └── ops.cpp
│      └── op_mlu.cpp
└── tensorrt
├── trt_cuda_helper.cuh
├── trt_plugin_helper.hpp
......@@ -63,108 +77,113 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
- `common`: This directory contains all tools and shared codes.
- `cuda`: The cuda kernels which can be shared by all backends. **HIP** kernel is also here since they have similar syntax.
- `onnxruntime`: **ONNX Runtime** support for custom ops.
- `mps`: The tools used to support MPS ops. **NOTE** that MPS support is **experimental**.
- `mlu`: The MLU kernels used to support [Cambricon](https://www.cambricon.com/) device.
- `utils`: The kernels and utils of spconv.
- `onnxruntime`: **ONNX Runtime** support for custom ops. Has been deprecated, please try the latest custom ops in [MMDeploy](https://github.com/open-mmlab/mmdeploy).
- `cpu`: CPU implementation of supported ops.
- `parrots`: **Parrots** is a deep learning frame for model training and inference. Parrots custom ops are placed in this directory.
- `pytorch`: **PyTorch** custom ops are supported by binding C++ to Python with **pybind11**. The ops implementation and binding codes are placed in this directory.
- `cuda`: This directory contains cuda kernel launchers, which feed memory pointers of tensor to the cuda kernel in `common/cuda`. The launchers provide c++ interface of cuda implementation of corresponding custom ops.
- `cpu`: This directory contains the CPU implementations of the corresponding custom ops.
- `tensorrt`: **TensorRT** support for custom ops.
- `mlu`: This directory contains the launchers of the MLU kernels.
- `mps`: MPS ops implementation and launchers.
- `tensorrt`: **TensorRT** support for custom ops. Has been deprecated, please try the latest custom ops in [MMDeploy](https://github.com/open-mmlab/mmdeploy).
- `plugins`: This directory contains the implementation of the supported custom ops. Some ops might also use shared cuda kernel in `common/cuda`.
## How to add new PyTorch ops?
1. (Optional) Add shared kernel in `common` to support special hardware platform.
```c++
// src/common/cuda/new_ops_cuda_kernel.cuh
template <typename T>
__global__ void new_ops_forward_cuda_kernel(const T* input, T* output, ...) {
// forward here
}
```
Add cuda kernel launcher in `pytorch/cuda`.
```c++
// src/pytorch/cuda
#include <new_ops_cuda_kernel.cuh>
void NewOpsForwardCUDAKernelLauncher(Tensor input, Tensor output, ...){
// initialize
at::cuda::CUDAGuard device_guard(input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
...
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "new_ops_forward_cuda_kernel", ([&] {
new_ops_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
input.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),...);
}));
AT_CUDA_CHECK(cudaGetLastError());
}
```
```c++
// src/common/cuda/new_ops_cuda_kernel.cuh
template <typename T>
__global__ void new_ops_forward_cuda_kernel(const T* input, T* output, ...) {
// forward here
}
```
Add cuda kernel launcher in `pytorch/cuda`.
```c++
// src/pytorch/cuda
#include <new_ops_cuda_kernel.cuh>
void NewOpsForwardCUDAKernelLauncher(Tensor input, Tensor output, ...){
// initialize
at::cuda::CUDAGuard device_guard(input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
...
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "new_ops_forward_cuda_kernel", ([&] {
new_ops_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
input.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),...);
}));
AT_CUDA_CHECK(cudaGetLastError());
}
```
2. Register implementation for different devices.
```c++
// src/pytorch/cuda/cudabind.cpp
...
```c++
// src/pytorch/cuda/cudabind.cpp
...
Tensor new_ops_forward_cuda(Tensor input, Tensor output, ...){
// implement cuda forward here
// use `NewOpsForwardCUDAKernelLauncher` here
}
// declare interface here.
Tensor new_ops_forward_impl(Tensor input, Tensor output, ...);
// register the implementation for given device (CUDA here).
REGISTER_DEVICE_IMPL(new_ops_forward_impl, CUDA, new_ops_forward_cuda);
```
Tensor new_ops_forward_cuda(Tensor input, Tensor output, ...){
// implement cuda forward here
// use `NewOpsForwardCUDAKernelLauncher` here
}
// declare interface here.
Tensor new_ops_forward_impl(Tensor input, Tensor output, ...);
// register the implementation for given device (CUDA here).
REGISTER_DEVICE_IMPL(new_ops_forward_impl, CUDA, new_ops_forward_cuda);
```
3. Add ops implementation in `pytorch` directory. Select different implementations according to device type.
```c++
// src/pytorch/new_ops.cpp
Tensor new_ops_forward_impl(Tensor input, Tensor output, ...){
// dispatch the implementation according to the device type of input.
DISPATCH_DEVICE_IMPL(new_ops_forward_impl, input, output, ...);
}
...
```c++
// src/pytorch/new_ops.cpp
Tensor new_ops_forward_impl(Tensor input, Tensor output, ...){
// dispatch the implementation according to the device type of input.
DISPATCH_DEVICE_IMPL(new_ops_forward_impl, input, output, ...);
}
...
Tensor new_ops_forward(Tensor input, Tensor output, ...){
return new_ops_forward_impl(input, output, ...);
}
```
Tensor new_ops_forward(Tensor input, Tensor output, ...){
return new_ops_forward_impl(input, output, ...);
}
```
4. Binding the implementation in `pytorch/pybind.cpp`
```c++
// src/pytorch/pybind.cpp
```c++
// src/pytorch/pybind.cpp
...
...
Tensor new_ops_forward(Tensor input, Tensor output, ...);
Tensor new_ops_forward(Tensor input, Tensor output, ...);
...
...
// bind with pybind11
m.def("new_ops_forward", &new_ops_forward, "new_ops_forward",
py::arg("input"), py::arg("output"), ...);
// bind with pybind11
m.def("new_ops_forward", &new_ops_forward, "new_ops_forward",
py::arg("input"), py::arg("output"), ...);
...
...
```
```
5. Build MMCV again. Enjoy new ops in python
```python
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['new_ops_forward'])
```python
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['new_ops_forward'])
...
...
ext_module.new_ops_forward(input, output, ...)
ext_module.new_ops_forward(input, output, ...)
```
```
......@@ -220,6 +220,10 @@ HOST_DEVICE_INLINE int convex_hull_graham(const Point<T> (&p)[24],
return temp > 0;
}
});
// compute distance to origin after sort, since the points are now different.
for (int i = 0; i < num_in; i++) {
dist[i] = dot_2d<T>(q[i], q[i]);
}
#endif
// Step 4:
......
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cuda/ActiveRotatingFilter_cuda.cu
#ifndef ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH
#define ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
// Forward kernel: scatters every weight element of the active rotated
// filter into its `num_rotations` rotated positions in the output tensor.
// One thread handles one element of `weight_data`.
template <typename scalar_t>
__global__ void active_rotated_filter_forward_cuda_kernel(
    const int nthreads, const scalar_t* weight_data, const int* indices_data,
    const int num_input_planes, const int num_output_planes,
    const int num_orientations, const int num_rotations, const int nEntry,
    scalar_t* output_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Decompose the flat index into (output plane i, input plane j,
    // entry l inside one filter of size nEntry).
    int l = index % nEntry;
    int j = (index / nEntry) % num_input_planes;
    int i = index / nEntry / num_input_planes;
    int k;
    scalar_t val = *(weight_data + index);
    for (k = 0; k < num_rotations; k++) {
      // indices_data stores 1-based positions; convert to 0-based offset.
      int idx = (int)(*(indices_data + l * num_rotations + k)) - 1;
      // Write the same weight value to rotation k of the output layout
      // [i][k][j][idx].
      scalar_t* target = output_data +
                         i * (num_rotations * num_input_planes * nEntry) +
                         k * (num_input_planes * nEntry) + j * (nEntry) + idx;
      *target = val;
    }
  }
}
// Backward kernel: gathers the gradients of all rotated copies back into
// the original weight layout (the inverse of the forward scatter). One
// thread accumulates the gradient for one weight element.
template <typename scalar_t>
__global__ void active_rotated_filter_backward_cuda_kernel(
    const int nthreads, const scalar_t* gradWeight_data,
    const int* indices_data, const int num_input_planes,
    const int num_output_planes, const int num_orientations,
    const int num_rotations, const int nEntry, scalar_t* weight_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Same flat-index decomposition as the forward kernel.
    int l = index % nEntry;
    int j = (index / nEntry) % num_input_planes;
    int i = index / nEntry / num_input_planes;
    int k;
    scalar_t* val = weight_data + index;
    *val = 0;
    scalar_t tmp = 0;
    for (k = 0; k < num_rotations; k++) {
      // indices_data stores 1-based positions; convert to 0-based offset.
      int idx = (int)(*(indices_data + l * num_rotations + k)) - 1;
      // Read the upstream gradient of rotation k at layout [i][k][j][idx].
      scalar_t target =
          *(gradWeight_data + i * (num_rotations * num_input_planes * nEntry) +
            k * (num_input_planes * nEntry) + j * (nEntry) + idx);
      // Sum contributions from every rotation into the single weight grad.
      tmp = tmp + target;
    }
    *val = tmp;
  }
}
#endif // ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH
......@@ -22,34 +22,34 @@ __global__ void assign_score_withk_forward_cuda_kernel(
const int O, const int aggregate, const T* points, const T* centers,
const T* scores, const int64_t* knn_idx, T* output) {
// ----- parallel loop for B, N1, K and O ---------
long i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= B * N1 * K * O) return;
// ------- loop for M ----------
const int b = (int)(i / (O * N1 * K));
const int o = (int)(i % (O * N1 * K) / (N1 * K));
const int n = (int)(i % (N1 * K) / K);
const int k = (int)(i % K);
const int cn = (int)knn_idx[b * K * N1 + n * K +
0]; // The first neighbor is the center point
const int kn = (int)knn_idx[b * K * N1 + n * K + k];
if (kn >= N0 ||
kn < 0) { // if index overflows, it is out of the neighborhood range
return;
}
assert(b < B);
assert(kn < N0);
assert(cn < N0);
assert(o < O);
assert(n < N1);
const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k;
T val = output[out_idx];
for (int m = 0; m < M; m++) {
val += points[b * N0 * M * O + kn * M * O + m * O + o] *
scores[b * N1 * K * M + n * K * M + k * M + m] -
centers[b * N0 * M * O + cn * M * O + m * O + o] *
scores[b * N1 * K * M + n * K * M + k * M + m];
CUDA_1D_KERNEL_LOOP(i, B * O * N1 * K) {
// ------- loop for M ----------
const int b = (int)(i / (O * N1 * K));
const int o = (int)(i % (O * N1 * K) / (N1 * K));
const int n = (int)(i % (N1 * K) / K);
const int k = (int)(i % K);
const int cn = (int)knn_idx[b * K * N1 + n * K +
0]; // The first neighbor is the center point
const int kn = (int)knn_idx[b * K * N1 + n * K + k];
if (kn >= N0 ||
kn < 0) { // if index overflows, it is out of the neighborhood range
return;
}
assert(b < B);
assert(kn < N0);
assert(cn < N0);
assert(o < O);
assert(n < N1);
const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k;
T val = output[out_idx];
for (int m = 0; m < M; m++) {
val += points[b * N0 * M * O + kn * M * O + m * O + o] *
scores[b * N1 * K * M + n * K * M + k * M + m] -
centers[b * N0 * M * O + cn * M * O + m * O + o] *
scores[b * N1 * K * M + n * K * M + k * M + m];
}
output[out_idx] = val;
}
output[out_idx] = val;
}
template <typename T>
......@@ -58,27 +58,27 @@ __global__ void assign_score_withk_points_backward_cuda_kernel(
const int O, const int aggregate, const T* grad_out, const T* scores,
const int64_t* knn_idx, T* grad_points, T* grad_centers) {
// ----- parallel loop for B, M, O ---------
long i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= B * M * O) return;
int b = (int)(i / (M * O));
int m = (int)(i % (M * O) / O);
int o = (int)(i % O);
CUDA_1D_KERNEL_LOOP(i, B * M * O) {
int b = (int)(i / (M * O));
int m = (int)(i % (M * O) / O);
int o = (int)(i % O);
// ----- loop for N,K ---------
for (int n = 0; n < N; n++) {
for (int k = 0; k < K; k++) {
int kn = knn_idx[b * N * K + n * K + k];
int cn = knn_idx[b * N * K + n * K + 0];
if (kn >= N0 ||
kn < 0) { // if index overflows, it is out of the neighborhood range
continue;
// ----- loop for N,K ---------
for (int n = 0; n < N; n++) {
for (int k = 0; k < K; k++) {
int kn = knn_idx[b * N * K + n * K + k];
int cn = knn_idx[b * N * K + n * K + 0];
if (kn >= N0 || kn < 0) { // if index overflows, it is out of the
// neighborhood range
continue;
}
atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o,
scores[b * N * K * M + n * K * M + k * M + m] *
grad_out[b * O * N * K + o * N * K + n * K + k]);
atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o,
-scores[b * N * K * M + n * K * M + k * M + m] *
grad_out[b * O * N * K + o * N * K + n * K + k]);
}
atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o,
scores[b * N * K * M + n * K * M + k * M + m] *
grad_out[b * O * N * K + o * N * K + n * K + k]);
atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o,
-scores[b * N * K * M + n * K * M + k * M + m] *
grad_out[b * O * N * K + o * N * K + n * K + k]);
}
}
}
......@@ -89,28 +89,28 @@ __global__ void assign_score_withk_scores_backward_cuda_kernel(
const int O, const int aggregate, const T* grad_out, const T* points,
const T* centers, const int64_t* knn_idx, T* grad_scores) {
// ----- parallel loop for B, N, K, M ---------
long i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= B * N * K * M) return;
const int b = (int)(i / (N * M * K));
const int n = (int)(i % (N * M * K) / M / K);
const int k = (int)(i % (M * K) / M);
const int m = (int)(i % M);
const int cn = knn_idx[b * N * K + n * K + 0];
const int kn = knn_idx[b * N * K + n * K + k];
if (kn >= N0 ||
kn < 0) { // if index overflows, it is out of the neighborhood range
return;
}
CUDA_1D_KERNEL_LOOP(i, B * N * K * M) {
const int b = (int)(i / (N * M * K));
const int n = (int)(i % (N * M * K) / M / K);
const int k = (int)(i % (M * K) / M);
const int m = (int)(i % M);
const int cn = knn_idx[b * N * K + n * K + 0];
const int kn = knn_idx[b * N * K + n * K + k];
if (kn >= N0 ||
kn < 0) { // if index overflows, it is out of the neighborhood range
return;
}
// -------------- loop for O ------------------------
const int out_idx = b * N * K * M + n * K * M + k * M + m;
T val = grad_scores[out_idx];
for (int o = 0; o < O; o++) {
val += (points[b * N0 * M * O + kn * M * O + m * O + o] -
centers[b * N0 * M * O + cn * M * O + m * O + o]) *
grad_out[b * O * N * K + o * N * K + n * K + k];
// -------------- loop for O ------------------------
const int out_idx = b * N * K * M + n * K * M + k * M + m;
T val = grad_scores[out_idx];
for (int o = 0; o < O; o++) {
val += (points[b * N0 * M * O + kn * M * O + m * O + o] -
centers[b * N0 * M * O + cn * M * O + m * O + o]) *
grad_out[b * O * N * K + o * N * K + n * K + k];
}
grad_scores[out_idx] = val;
}
grad_scores[out_idx] = val;
}
#endif // ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH
......@@ -21,35 +21,36 @@ __global__ void ball_query_forward_cuda_kernel(int b, int n, int m,
// output:
// idx: (B, M, nsample)
int bs_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= b || pt_idx >= m) return;
CUDA_1D_KERNEL_LOOP(pt_idx, m) {
if (bs_idx >= b) return;
new_xyz += bs_idx * m * 3 + pt_idx * 3;
xyz += bs_idx * n * 3;
idx += bs_idx * m * nsample + pt_idx * nsample;
new_xyz += bs_idx * m * 3 + pt_idx * 3;
xyz += bs_idx * n * 3;
idx += bs_idx * m * nsample + pt_idx * nsample;
float max_radius2 = max_radius * max_radius;
float min_radius2 = min_radius * min_radius;
T new_x = new_xyz[0];
T new_y = new_xyz[1];
T new_z = new_xyz[2];
float max_radius2 = max_radius * max_radius;
float min_radius2 = min_radius * min_radius;
T new_x = new_xyz[0];
T new_y = new_xyz[1];
T new_z = new_xyz[2];
int cnt = 0;
for (int k = 0; k < n; ++k) {
T x = xyz[k * 3 + 0];
T y = xyz[k * 3 + 1];
T z = xyz[k * 3 + 2];
T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
(new_z - z) * (new_z - z);
if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) {
if (cnt == 0) {
for (int l = 0; l < nsample; ++l) {
idx[l] = k;
int cnt = 0;
for (int k = 0; k < n; ++k) {
T x = xyz[k * 3 + 0];
T y = xyz[k * 3 + 1];
T z = xyz[k * 3 + 2];
T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
(new_z - z) * (new_z - z);
if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) {
if (cnt == 0) {
for (int l = 0; l < nsample; ++l) {
idx[l] = k;
}
}
idx[cnt] = k;
++cnt;
if (cnt >= nsample) break;
}
idx[cnt] = k;
++cnt;
if (cnt >= nsample) break;
}
}
}
......
......@@ -8,6 +8,27 @@
#include "pytorch_cuda_helper.hpp"
#endif
// Loads one box (x1, y1, x2, y2) starting at bbox[base] into the four
// output references.
template <typename T>
__device__ __forceinline__ void load_bbox(const T* bbox, const int base, T& x1,
                                          T& y1, T& x2, T& y2) {
  x1 = bbox[base];
  y1 = bbox[base + 1];
  x2 = bbox[base + 2];
  y2 = bbox[base + 3];
}

// float specialization: a single vectorized float4 load instead of four
// scalar loads. NOTE(review): the reinterpret_cast assumes bbox + base is
// 16-byte aligned; confirm callers only pass base as a multiple of 4 into
// an aligned buffer.
template <>
__device__ __forceinline__ void load_bbox<float>(const float* bbox,
                                                 const int base, float& x1,
                                                 float& y1, float& x2,
                                                 float& y2) {
  const float4 bbox_offset = reinterpret_cast<const float4*>(bbox + base)[0];
  x1 = bbox_offset.x;
  y1 = bbox_offset.y;
  x2 = bbox_offset.z;
  y2 = bbox_offset.w;
}
template <typename T>
__global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2,
T* ious, const int num_bbox1,
......@@ -16,69 +37,111 @@ __global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2,
const int offset) {
if (aligned) {
CUDA_1D_KERNEL_LOOP(index, num_bbox1) {
int b1 = index;
int b2 = index;
int base1 = b1 * 4;
T b1_x1 = bbox1[base1];
T b1_y1 = bbox1[base1 + 1];
T b1_x2 = bbox1[base1 + 2];
T b1_y2 = bbox1[base1 + 3];
T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
int base2 = b2 * 4;
T b2_x1 = bbox2[base2];
T b2_y1 = bbox2[base2 + 1];
T b2_x2 = bbox2[base2 + 2];
T b2_y2 = bbox2[base2 + 3];
T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
T width = fmaxf(right - left + offset, 0.f);
T height = fmaxf(bottom - top + offset, 0.f);
T interS = width * height;
T baseS = 1.0;
if (mode == 0) {
baseS = fmaxf(b1_area + b2_area - interS, T(offset));
} else if (mode == 1) {
baseS = fmaxf(b1_area, T(offset));
}
const int b1 = index;
const int b2 = index;
const int base1 = b1 << 2; // b1 * 4
T b1_x1, b1_y1, b1_x2, b1_y2;
load_bbox<T>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);
const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
const int base2 = b2 << 2; // b2 * 4
T b2_x1, b2_y1, b2_x2, b2_y2;
load_bbox<T>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);
const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
const T width = fmaxf(right - left + offset, 0.f);
const T height = fmaxf(bottom - top + offset, 0.f);
const T interS = width * height;
const T baseS =
fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset));
ious[index] = interS / baseS;
}
} else {
CUDA_1D_KERNEL_LOOP(index, num_bbox1 * num_bbox2) {
int b1 = index / num_bbox2;
int b2 = index % num_bbox2;
int base1 = b1 * 4;
T b1_x1 = bbox1[base1];
T b1_y1 = bbox1[base1 + 1];
T b1_x2 = bbox1[base1 + 2];
T b1_y2 = bbox1[base1 + 3];
T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
int base2 = b2 * 4;
T b2_x1 = bbox2[base2];
T b2_y1 = bbox2[base2 + 1];
T b2_x2 = bbox2[base2 + 2];
T b2_y2 = bbox2[base2 + 3];
T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
T width = fmaxf(right - left + offset, 0.f);
T height = fmaxf(bottom - top + offset, 0.f);
T interS = width * height;
T baseS = 1.0;
if (mode == 0) {
baseS = fmaxf(b1_area + b2_area - interS, T(offset));
} else if (mode == 1) {
baseS = fmaxf(b1_area, T(offset));
}
const int b1 = index / num_bbox2;
const int b2 = index % num_bbox2;
const int base1 = b1 << 2; // b1 * 4
T b1_x1, b1_y1, b1_x2, b1_y2;
load_bbox<T>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);
const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
const int base2 = b2 << 2; // b2 * 4
T b2_x1, b2_y1, b2_x2, b2_y2;
load_bbox<T>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);
const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
const T width = fmaxf(right - left + offset, 0.f);
const T height = fmaxf(bottom - top + offset, 0.f);
const T interS = width * height;
const T baseS =
fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset));
ious[index] = interS / baseS;
}
}
}
// Half-precision intrinsics (__hadd/__hmul/__hge/...) require compute
// capability >= 5.3.
#if __CUDA_ARCH__ >= 530

// Area of the box (x1, y1)-(x2, y2) in half precision, including `offset`
// on each side length (offset is 0 or 1 in the callers' convention).
__device__ __forceinline__ __half __half_area(const __half x1, const __half y1,
                                              const __half x2, const __half y2,
                                              const __half offset) {
  const __half half_w = __hadd(__hsub(x2, x1), offset);
  const __half half_h = __hadd(__hsub(y2, y1), offset);
  return __hmul(half_w, half_h);
}

// max(a, b) for __half; no such intrinsic exists, so compare explicitly.
__device__ __forceinline__ __half __half_max(const __half a, const __half b) {
  return __hge(a, b) ? a : b;
}

// min(a, b) for __half.
__device__ __forceinline__ __half __half_min(const __half a, const __half b) {
  return __hle(a, b) ? a : b;
}

// fp16 won't provide much increase when aligned==true. It is useful when
// aligned==false, which would give you ~40% bonus.
// Computes IoU (mode == 0) or IoF (otherwise) between bbox1 and bbox2
// entirely in half precision; one loop iteration produces one output iou.
__device__ void bbox_overlaps_cuda_kernel_half(
    const __half* bbox1, const __half* bbox2, __half* ious, const int num_bbox1,
    const int num_bbox2, const int mode, const bool aligned, const int offset) {
  // aligned: row i of bbox1 is paired with row i of bbox2; otherwise the
  // full num_bbox1 x num_bbox2 cross product is computed.
  const int num_output = aligned ? num_bbox1 : num_bbox1 * num_bbox2;
  const __half h_offset = __int2half_rn(offset);
  CUDA_1D_KERNEL_LOOP(index, num_output) {
    const int b1 = aligned ? index : index / num_bbox2;
    const int b2 = aligned ? index : index % num_bbox2;

    // base = b * 4: each box occupies four consecutive __half values.
    const int base1 = b1 << 2;
    __half b1_x1, b1_y1, b1_x2, b1_y2;
    load_bbox<__half>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);
    const __half b1_area = __half_area(b1_x1, b1_y1, b1_x2, b1_y2, h_offset);

    const int base2 = b2 << 2;
    __half b2_x1, b2_y1, b2_x2, b2_y2;
    load_bbox<__half>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);
    const __half b2_area = __half_area(b2_x1, b2_y1, b2_x2, b2_y2, h_offset);

    // Intersection rectangle, clamped to zero width/height when disjoint.
    const __half left = __half_max(b1_x1, b2_x1),
                 right = __half_min(b1_x2, b2_x2);
    const __half top = __half_max(b1_y1, b2_y1),
                 bottom = __half_min(b1_y2, b2_y2);
    const __half width =
        __half_max(__hadd(__hsub(right, left), h_offset), __float2half(0.f));
    const __half height =
        __half_max(__hadd(__hsub(bottom, top), h_offset), __float2half(0.f));
    const __half interS = __hmul(width, height);

    // Denominator: union area for IoU, bbox1's area for IoF; clamped by
    // h_offset to avoid division by zero.
    const __half baseS = __half_max(
        mode == 0 ? __hsub(__hadd(b1_area, b2_area), interS) : b1_area,
        h_offset);
    ious[index] = __hdiv(interS, baseS);
  }
}
#endif  // __CUDA_ARCH__ >= 530
#endif // BBOX_OVERLAPS_CUDA_KERNEL_CUH
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment