Commit fdeee889 authored by limm's avatar limm
Browse files

release v1.6.1 of mmcv

parent df465820
// Copyright (c) OpenMMLab. All rights reserved
#include <torch/extension.h>
#include "pytorch_cpp_helper.hpp"
std::string get_compiler_version();
......@@ -113,17 +115,15 @@ void bbox_overlaps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
void knn_forward(Tensor xyz_tensor, Tensor new_xyz_tensor, Tensor idx_tensor,
Tensor dist2_tensor, int b, int n, int m, int nsample);
void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
Tensor ans_overlap);
void iou3d_boxes_iou_bev_forward(Tensor boxes_a, Tensor boxes_b,
Tensor ans_iou);
void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
float nms_overlap_thresh);
void iou3d_nms3d_forward(Tensor boxes, Tensor keep, Tensor keep_num,
float nms_overlap_thresh);
void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
float nms_overlap_thresh);
void iou3d_nms3d_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
float nms_overlap_thresh);
void furthest_point_sampling_forward(Tensor points_tensor, Tensor temp_tensor,
Tensor idx_tensor, int b, int n, int m);
......@@ -240,21 +240,54 @@ void ball_query_forward(Tensor new_xyz_tensor, Tensor xyz_tensor,
Tensor idx_tensor, int b, int n, int m,
float min_radius, float max_radius, int nsample);
Tensor bottom_pool_forward(Tensor input);
Tensor bottom_pool_backward(Tensor input, Tensor grad_output);
Tensor left_pool_forward(Tensor input);
Tensor left_pool_backward(Tensor input, Tensor grad_output);
Tensor right_pool_forward(Tensor input);
Tensor right_pool_backward(Tensor input, Tensor grad_output);
Tensor top_pool_forward(Tensor input);
Tensor top_pool_backward(Tensor input, Tensor grad_output);
void prroi_pool_forward(Tensor input, Tensor rois, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale);
void prroi_pool_backward(Tensor grad_output, Tensor rois, Tensor grad_input,
int pooled_height, int pooled_width,
float spatial_scale);
void prroi_pool_coor_backward(Tensor output, Tensor grad_output, Tensor input,
Tensor rois, Tensor grad_rois, int pooled_height,
int pooled_width, float spatial_scale);
template <unsigned NDim>
std::vector<torch::Tensor> get_indice_pairs_forward(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template <unsigned NDim>
std::vector<Tensor> get_indice_pairs_backward(
Tensor indices, Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
Tensor indice_conv_forward(Tensor features, Tensor filters, Tensor indicePairs,
Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM);
std::vector<Tensor> indice_conv_backward(Tensor features, Tensor filters,
Tensor outGrad, Tensor indicePairs,
Tensor indiceNum, int64_t _inverse,
int64_t _subM);
Tensor fused_indice_conv_batchnorm_forward(Tensor features, Tensor filters,
Tensor bias, Tensor indicePairs,
Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM);
Tensor indice_maxpool_forward(Tensor features, Tensor indicePairs,
Tensor indiceNum, int64_t numAct);
Tensor indice_maxpool_backward(Tensor features, Tensor outFeatures,
Tensor outGrad, Tensor indicePairs,
Tensor indiceNum);
void box_iou_rotated(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned);
......@@ -273,13 +306,14 @@ Tensor fused_bias_leakyrelu(const Tensor &input, const Tensor &bias,
void roi_align_rotated_forward(Tensor input, Tensor rois, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale, int sample_num,
float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise);
void roi_align_rotated_backward(Tensor grad_output, Tensor rois,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale,
int sample_num, bool aligned, bool clockwise);
int sampling_ratio, bool aligned,
bool clockwise);
std::vector<torch::Tensor> dynamic_point_to_voxel_forward(
const torch::Tensor &feats, const torch::Tensor &coors,
......@@ -298,7 +332,8 @@ void hard_voxelize_forward(const at::Tensor &points,
const at::Tensor &coors_range, at::Tensor &voxels,
at::Tensor &coors, at::Tensor &num_points_per_voxel,
at::Tensor &voxel_num, const int max_points,
const int max_voxels, const int NDim);
const int max_voxels, const int NDim,
const bool deterministic);
void dynamic_voxelize_forward(const at::Tensor &points,
const at::Tensor &voxel_size,
......@@ -340,6 +375,54 @@ void correlation_backward(Tensor grad_output, Tensor input1, Tensor input2,
int dilationH, int dilationW, int dilation_patchH,
int dilation_patchW, int dH, int dW);
void rotated_feature_align_forward(const Tensor features,
const Tensor best_bboxes, Tensor output,
const float spatial_scale, const int points);
void rotated_feature_align_backward(const Tensor top_grad,
const Tensor best_bboxes,
Tensor bottom_grad,
const float spatial_scale,
const int points);
void riroi_align_rotated_forward(Tensor features, Tensor rois, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale, int num_samples,
int num_orientations, bool clockwise);
void riroi_align_rotated_backward(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int pooled_height,
int pooled_width, float spatial_scale,
int num_samples, int num_orientations,
bool clockwise);
void points_in_polygons_forward(Tensor points, Tensor polygons, Tensor output);
void min_area_polygons(const Tensor pointsets, Tensor polygons);
void active_rotated_filter_forward(const Tensor input, const Tensor indices,
Tensor output);
void active_rotated_filter_backward(const Tensor grad_out, const Tensor indices,
Tensor grad_in);
void convex_iou(const Tensor pointsets, const Tensor polygons, Tensor ious);
void convex_giou(const Tensor pointsets, const Tensor polygons, Tensor output);
at::Tensor diff_iou_rotated_sort_vertices_forward(at::Tensor vertices,
at::Tensor mask,
at::Tensor num_valid);
void chamfer_distance_forward(const Tensor xyz1, const Tensor xyz2,
const Tensor dist1, const Tensor dist2,
const Tensor idx1, const Tensor idx);
void chamfer_distance_backward(const Tensor xyz1, const Tensor xyz2,
Tensor gradxyz1, Tensor gradxyz2,
Tensor graddist1, Tensor graddist2, Tensor idx1,
Tensor idx2);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("upfirdn2d", &upfirdn2d, "upfirdn2d (CUDA)", py::arg("input"),
py::arg("kernel"), py::arg("up_x"), py::arg("up_y"), py::arg("down_x"),
......@@ -395,21 +478,21 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward",
py::arg("input"), py::arg("weight"), py::arg("offset"),
py::arg("output"), py::arg("columns"), py::arg("ones"), py::arg("kW"),
py::arg("kH"), py::arg("dW"), py::arg("dH"), py::arg("padH"),
py::arg("padW"), py::arg("dilationW"), py::arg("dilationH"),
py::arg("kH"), py::arg("dW"), py::arg("dH"), py::arg("padW"),
py::arg("padH"), py::arg("dilationW"), py::arg("dilationH"),
py::arg("group"), py::arg("deformable_group"), py::arg("im2col_step"));
m.def("deform_conv_backward_input", &deform_conv_backward_input,
"deform_conv_backward_input", py::arg("input"), py::arg("offset"),
py::arg("gradOutput"), py::arg("gradInput"), py::arg("gradOffset"),
py::arg("weight"), py::arg("columns"), py::arg("kW"), py::arg("kH"),
py::arg("dW"), py::arg("dH"), py::arg("padH"), py::arg("padW"),
py::arg("dW"), py::arg("dH"), py::arg("padW"), py::arg("padH"),
py::arg("dilationW"), py::arg("dilationH"), py::arg("group"),
py::arg("deformable_group"), py::arg("im2col_step"));
m.def("deform_conv_backward_parameters", &deform_conv_backward_parameters,
"deform_conv_backward_parameters", py::arg("input"), py::arg("offset"),
py::arg("gradOutput"), py::arg("gradWeight"), py::arg("columns"),
py::arg("ones"), py::arg("kW"), py::arg("kH"), py::arg("dW"),
py::arg("dH"), py::arg("padH"), py::arg("padW"), py::arg("dilationW"),
py::arg("dH"), py::arg("padW"), py::arg("padH"), py::arg("dilationW"),
py::arg("dilationH"), py::arg("group"), py::arg("deformable_group"),
py::arg("scale"), py::arg("im2col_step"));
m.def("deform_roi_pool_forward", &deform_roi_pool_forward,
......@@ -473,15 +556,12 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
py::arg("dist2_tensor"));
m.def("iou3d_boxes_overlap_bev_forward", &iou3d_boxes_overlap_bev_forward,
"iou3d_boxes_overlap_bev_forward", py::arg("boxes_a"),
py::arg("boxes_b"), py::arg("ans_overlap"));
m.def("iou3d_boxes_iou_bev_forward", &iou3d_boxes_iou_bev_forward,
"iou3d_boxes_iou_bev_forward", py::arg("boxes_a"), py::arg("boxes_b"),
py::arg("ans_iou"));
m.def("iou3d_nms_forward", &iou3d_nms_forward, "iou3d_nms_forward",
py::arg("boxes_b"), py::arg("ans_iou"));
m.def("iou3d_nms3d_forward", &iou3d_nms3d_forward, "iou3d_nms3d_forward",
py::arg("boxes"), py::arg("keep"), py::arg("num_out"),
py::arg("nms_overlap_thresh"));
m.def("iou3d_nms_normal_forward", &iou3d_nms_normal_forward,
"iou3d_nms_normal_forward", py::arg("boxes"), py::arg("keep"),
m.def("iou3d_nms3d_normal_forward", &iou3d_nms3d_normal_forward,
"iou3d_nms3d_normal_forward", py::arg("boxes"), py::arg("keep"),
py::arg("num_out"), py::arg("nms_overlap_thresh"));
m.def("furthest_point_sampling_forward", &furthest_point_sampling_forward,
"furthest_point_sampling_forward", py::arg("points_tensor"),
......@@ -567,6 +647,54 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
"sync_bn backward_data", py::arg("grad_output"), py::arg("weight"),
py::arg("grad_weight"), py::arg("grad_bias"), py::arg("norm"),
py::arg("std"), py::arg("grad_input"));
m.def("get_indice_pairs_2d_forward", &get_indice_pairs_forward<2>,
"get_indice_pairs_2d_forward", py::arg("indices"), py::arg("batchSize"),
py::arg("outSpatialShape"), py::arg("spatialShape"),
py::arg("kernelSize"), py::arg("stride"), py::arg("padding"),
py::arg("dilation"), py::arg("outPadding"), py::arg("_subM"),
py::arg("_transpose"));
m.def("get_indice_pairs_3d_forward", &get_indice_pairs_forward<3>,
"get_indice_pairs_3d_forward", py::arg("indices"), py::arg("batchSize"),
py::arg("outSpatialShape"), py::arg("spatialShape"),
py::arg("kernelSize"), py::arg("stride"), py::arg("padding"),
py::arg("dilation"), py::arg("outPadding"), py::arg("_subM"),
py::arg("_transpose"));
m.def("get_indice_pairs_4d_forward", &get_indice_pairs_forward<4>,
"get_indice_pairs_4d_forward", py::arg("indices"), py::arg("batchSize"),
py::arg("outSpatialShape"), py::arg("spatialShape"),
py::arg("kernelSize"), py::arg("stride"), py::arg("padding"),
py::arg("dilation"), py::arg("outPadding"), py::arg("_subM"),
py::arg("_transpose"));
m.def("get_indice_pairs_2d_backward", &get_indice_pairs_backward<2>,
"get_indice_pairs_2d_backward", py::arg("indices"), py::arg("gridOut"),
py::arg("batchSize"), py::arg("outSpatialShape"),
py::arg("spatialShape"), py::arg("kernelSize"), py::arg("stride"),
py::arg("padding"), py::arg("dilation"), py::arg("outPadding"),
py::arg("_subM"), py::arg("_transpose"));
m.def("get_indice_pairs_3d_backward", &get_indice_pairs_backward<3>,
"get_indice_pairs_3d_backward", py::arg("indices"), py::arg("gridOut"),
py::arg("batchSize"), py::arg("outSpatialShape"),
py::arg("spatialShape"), py::arg("kernelSize"), py::arg("stride"),
py::arg("padding"), py::arg("dilation"), py::arg("outPadding"),
py::arg("_subM"), py::arg("_transpose"));
m.def("indice_conv_forward", &indice_conv_forward, "indice_conv_forward",
py::arg("features"), py::arg("filters"), py::arg("indicePairs"),
py::arg("indiceNum"), py::arg("numActOut"), py::arg("_inverse"),
py::arg("_subM"));
m.def("indice_conv_backward", &indice_conv_backward, "indice_conv_backward",
py::arg("features"), py::arg("filters"), py::arg("outGrad"),
py::arg("indicePairs"), py::arg("indiceNum"), py::arg("_inverse"),
py::arg("_subM"));
m.def("fused_indice_conv_forward", &fused_indice_conv_batchnorm_forward,
"fused_indice_conv_forward", py::arg("features"), py::arg("filters"),
py::arg("bias"), py::arg("indicePairs"), py::arg("indiceNum"),
py::arg("numActOut"), py::arg("_inverse"), py::arg("_subM"));
m.def("indice_maxpool_forward", &indice_maxpool_forward,
"indice_maxpool_forward", py::arg("features"), py::arg("indicePairs"),
py::arg("indiceNum"), py::arg("numAct"));
m.def("indice_maxpool_backward", &indice_maxpool_backward,
"indice_maxpool_backward", py::arg("features"), py::arg("outFeatures"),
py::arg("outGrad"), py::arg("indicePairs"), py::arg("indiceNum"));
m.def("psamask_forward", &psamask_forward, "PSAMASK forward (CPU/CUDA)",
py::arg("input"), py::arg("output"), py::arg("psa_type"),
py::arg("num_"), py::arg("h_feature"), py::arg("w_feature"),
......@@ -581,26 +709,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
py::arg("input"), py::arg("shift"), py::arg("output"));
m.def("tin_shift_backward", &tin_shift_backward, "tin_shift backward",
py::arg("grad_output"), py::arg("shift"), py::arg("grad_input"));
m.def("bottom_pool_forward", &bottom_pool_forward, "Bottom Pool Forward",
py::arg("input"), py::call_guard<py::gil_scoped_release>());
m.def("bottom_pool_backward", &bottom_pool_backward, "Bottom Pool Backward",
py::arg("input"), py::arg("grad_output"),
py::call_guard<py::gil_scoped_release>());
m.def("left_pool_forward", &left_pool_forward, "Left Pool Forward",
py::arg("input"), py::call_guard<py::gil_scoped_release>());
m.def("left_pool_backward", &left_pool_backward, "Left Pool Backward",
py::arg("input"), py::arg("grad_output"),
py::call_guard<py::gil_scoped_release>());
m.def("right_pool_forward", &right_pool_forward, "Right Pool Forward",
py::arg("input"), py::call_guard<py::gil_scoped_release>());
m.def("right_pool_backward", &right_pool_backward, "Right Pool Backward",
py::arg("input"), py::arg("grad_output"),
py::call_guard<py::gil_scoped_release>());
m.def("top_pool_forward", &top_pool_forward, "Top Pool Forward",
py::arg("input"), py::call_guard<py::gil_scoped_release>());
m.def("top_pool_backward", &top_pool_backward, "Top Pool Backward",
py::arg("input"), py::arg("grad_output"),
py::call_guard<py::gil_scoped_release>());
m.def("box_iou_rotated", &box_iou_rotated, "IoU for rotated boxes",
py::arg("boxes1"), py::arg("boxes2"), py::arg("ious"),
py::arg("mode_flag"), py::arg("aligned"));
......@@ -614,13 +722,13 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("roi_align_rotated_forward", &roi_align_rotated_forward,
"roi_align_rotated forward", py::arg("input"), py::arg("rois"),
py::arg("output"), py::arg("pooled_height"), py::arg("pooled_width"),
py::arg("spatial_scale"), py::arg("sample_num"), py::arg("aligned"),
py::arg("spatial_scale"), py::arg("sampling_ratio"), py::arg("aligned"),
py::arg("clockwise"));
m.def("roi_align_rotated_backward", &roi_align_rotated_backward,
"roi_align_rotated backward", py::arg("rois"), py::arg("grad_input"),
py::arg("grad_output"), py::arg("pooled_height"),
py::arg("pooled_width"), py::arg("spatial_scale"),
py::arg("sample_num"), py::arg("aligned"), py::arg("clockwise"));
py::arg("sampling_ratio"), py::arg("aligned"), py::arg("clockwise"));
m.def("dynamic_point_to_voxel_forward", &dynamic_point_to_voxel_forward,
"dynamic_point_to_voxel_forward", py::arg("feats"), py::arg("coors"),
py::arg("reduce_type"));
......@@ -633,7 +741,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
"hard_voxelize_forward", py::arg("points"), py::arg("voxel_size"),
py::arg("coors_range"), py::arg("voxels"), py::arg("coors"),
py::arg("num_points_per_voxel"), py::arg("voxel_num"),
py::arg("max_points"), py::arg("max_voxels"), py::arg("NDim"));
py::arg("max_points"), py::arg("max_voxels"), py::arg("NDim"),
py::arg("deterministic"));
m.def("dynamic_voxelize_forward", &dynamic_voxelize_forward,
"dynamic_voxelize_forward", py::arg("points"), py::arg("voxel_size"),
py::arg("coors_range"), py::arg("coors"), py::arg("NDim"));
......@@ -686,4 +795,62 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
"roiaware_pool3d_backward", py::arg("pts_idx_of_voxels"),
py::arg("argmax"), py::arg("grad_out"), py::arg("grad_in"),
py::arg("pool_method"));
m.def("rotated_feature_align_forward", &rotated_feature_align_forward,
"Feature Refine forward (CUDA)", py::arg("features"),
py::arg("best_bboxes"), py::arg("output"), py::arg("spatial_scale"),
py::arg("points"));
m.def("rotated_feature_align_backward", &rotated_feature_align_backward,
"Feature Refine backward (CUDA)", py::arg("top_grad"),
py::arg("best_bboxes"), py::arg("bottom_grad"),
py::arg("spatial_scale"), py::arg("points"));
m.def("riroi_align_rotated_forward", &riroi_align_rotated_forward,
"riroi_align_rotated forward", py::arg("features"), py::arg("rois"),
py::arg("output"), py::arg("pooled_height"), py::arg("pooled_width"),
py::arg("spatial_scale"), py::arg("num_samples"),
py::arg("num_orientations"), py::arg("clockwise"));
m.def("riroi_align_rotated_backward", &riroi_align_rotated_backward,
"riroi_align_rotated backward", py::arg("top_grad"), py::arg("rois"),
py::arg("bottom_grad"), py::arg("pooled_height"),
py::arg("pooled_width"), py::arg("spatial_scale"),
py::arg("num_samples"), py::arg("num_orientations"),
py::arg("clockwise"));
m.def("points_in_polygons_forward", &points_in_polygons_forward,
"points_in_polygons_forward", py::arg("points"), py::arg("polygons"),
py::arg("output"));
m.def("min_area_polygons", &min_area_polygons, "min_area_polygons",
py::arg("pointsets"), py::arg("polygons"));
m.def("active_rotated_filter_forward", &active_rotated_filter_forward,
"active_rotated_filter_forward", py::arg("input"), py::arg("indices"),
py::arg("output"));
m.def("active_rotated_filter_backward", &active_rotated_filter_backward,
"active_rotated_filter_backward", py::arg("grad_out"),
py::arg("indices"), py::arg("grad_in"));
m.def("convex_iou", &convex_iou, "convex_iou", py::arg("pointsets"),
py::arg("polygons"), py::arg("ious"));
m.def("convex_giou", &convex_giou, "convex_giou", py::arg("pointsets"),
py::arg("polygons"), py::arg("output"));
m.def("diff_iou_rotated_sort_vertices_forward",
&diff_iou_rotated_sort_vertices_forward,
"diff_iou_rotated_sort_vertices_forward", py::arg("vertices"),
py::arg("mask"), py::arg("num_valid"));
m.def("chamfer_distance_forward", &chamfer_distance_forward,
"chamfer_distance_forward", py::arg("xyz1"), py::arg("xyz2"),
py::arg("dist1"), py::arg("dist2"), py::arg("idx1"), py::arg("idx2"));
m.def("chamfer_distance_backward", &chamfer_distance_backward,
"chamfer_distance_backward", py::arg("xyz1"), py::arg("xyz2"),
py::arg("gradxyz1"), py::arg("gradxyz2"), py::arg("graddist1"),
py::arg("graddist2"), py::arg("idx1"), py::arg("idx2"));
m.def("prroi_pool_forward", &prroi_pool_forward, "prroi_pool forward",
py::arg("input"), py::arg("rois"), py::arg("output"),
py::arg("pooled_height"), py::arg("pooled_width"),
py::arg("spatial_scale"));
m.def("prroi_pool_backward", &prroi_pool_backward, "prroi_pool_backward",
py::arg("grad_output"), py::arg("rois"), py::arg("grad_input"),
py::arg("pooled_height"), py::arg("pooled_width"),
py::arg("spatial_scale"));
m.def("prroi_pool_coor_backward", &prroi_pool_coor_backward,
"prroi_pool_coor_backward", py::arg("output"), py::arg("grad_output"),
py::arg("input"), py::arg("rois"), py::arg("grad_rois"),
py::arg("pooled_height"), py::arg("pooled_width"),
py::arg("spatial_scale"));
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
// Forward pass of Rotation-invariant RoI Align: dispatches to the backend
// (CPU/CUDA/...) registered for the device that the input tensors live on.
// `output` is written in place; the remaining arguments parameterize the
// pooled grid (pooled_height x pooled_width), the feature-map scale, the
// number of bilinear samples per bin, the number of orientation channels,
// and the rotation direction convention.
void riroi_align_rotated_forward_impl(Tensor features, Tensor rois,
                                      Tensor output, int pooled_height,
                                      int pooled_width, float spatial_scale,
                                      int num_samples, int num_orientations,
                                      bool clockwise) {
  DISPATCH_DEVICE_IMPL(riroi_align_rotated_forward_impl, features, rois, output,
                       pooled_height, pooled_width, spatial_scale, num_samples,
                       num_orientations, clockwise);
}
// Backward pass of Rotation-invariant RoI Align: routes the gradient
// computation to the device-registered implementation. `bottom_grad`
// (gradient w.r.t. the input feature map) is written in place from
// `top_grad` (gradient w.r.t. the pooled output).
void riroi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
                                       Tensor bottom_grad, int pooled_height,
                                       int pooled_width, float spatial_scale,
                                       int num_samples, int num_orientations,
                                       bool clockwise) {
  DISPATCH_DEVICE_IMPL(riroi_align_rotated_backward_impl, top_grad, rois,
                       bottom_grad, pooled_height, pooled_width, spatial_scale,
                       num_samples, num_orientations, clockwise);
}
// Public entry point for RiRoI Align Rotated (forward). A thin shim: it
// hands every argument straight to the device-dispatched implementation.
void riroi_align_rotated_forward(Tensor feats, Tensor rois, Tensor out,
                                 int pooled_h, int pooled_w, float scale,
                                 int n_samples, int n_orientations,
                                 bool clockwise) {
  riroi_align_rotated_forward_impl(feats, rois, out, pooled_h, pooled_w, scale,
                                   n_samples, n_orientations, clockwise);
}
// Public entry point for RiRoI Align Rotated (backward). Forwards all
// arguments, unchanged, to the device-dispatched implementation which
// fills `grad_in` with the input-feature gradient.
void riroi_align_rotated_backward(Tensor grad_out, Tensor rois,
                                  Tensor grad_in, int pooled_h, int pooled_w,
                                  float scale, int n_samples,
                                  int n_orientations, bool clockwise) {
  riroi_align_rotated_backward_impl(grad_out, rois, grad_in, pooled_h,
                                    pooled_w, scale, n_samples, n_orientations,
                                    clockwise);
}
......@@ -2,23 +2,23 @@
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void roi_align_rotated_forward_impl(Tensor features, Tensor rois, Tensor output,
void roi_align_rotated_forward_impl(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
float spatial_scale, int sample_ratio,
float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise) {
DISPATCH_DEVICE_IMPL(roi_align_rotated_forward_impl, features, rois, output,
DISPATCH_DEVICE_IMPL(roi_align_rotated_forward_impl, input, rois, output,
aligned_height, aligned_width, spatial_scale,
sample_ratio, aligned, clockwise);
sampling_ratio, aligned, clockwise);
}
void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int aligned_height,
int aligned_width, float spatial_scale,
int sample_ratio, bool aligned,
int sampling_ratio, bool aligned,
bool clockwise) {
DISPATCH_DEVICE_IMPL(roi_align_rotated_backward_impl, top_grad, rois,
bottom_grad, aligned_height, aligned_width,
spatial_scale, sample_ratio, aligned, clockwise);
spatial_scale, sampling_ratio, aligned, clockwise);
}
void roi_align_rotated_forward(Tensor input, Tensor rois, Tensor output,
......
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_cuda.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
// Forward pass of rotated feature alignment (feature refinement): dispatches
// to the backend registered for the input tensors' device. NOTE the impl's
// argument order differs from the public wrapper: the output tensor comes
// last here.
void rotated_feature_align_forward_impl(const Tensor features,
                                        const Tensor best_bboxes,
                                        const float spatial_scale,
                                        const int points, Tensor output) {
  DISPATCH_DEVICE_IMPL(rotated_feature_align_forward_impl, features,
                       best_bboxes, spatial_scale, points, output);
}
// Backward pass of rotated feature alignment: dispatches the gradient
// computation to the registered backend. As with the forward impl, the
// gradient output tensor (`bottom_grad`) is the last argument.
void rotated_feature_align_backward_impl(const Tensor top_grad,
                                         const Tensor best_bboxes,
                                         const float spatial_scale,
                                         const int points, Tensor bottom_grad) {
  DISPATCH_DEVICE_IMPL(rotated_feature_align_backward_impl, top_grad,
                       best_bboxes, spatial_scale, points, bottom_grad);
}
// Public entry for rotated feature alignment (forward). Reorders the
// arguments for the impl, which expects the output tensor last.
void rotated_feature_align_forward(const Tensor feats, const Tensor boxes,
                                   Tensor out, const float scale,
                                   const int num_points) {
  rotated_feature_align_forward_impl(feats, boxes, scale, num_points, out);
}
// Public entry for rotated feature alignment (backward). Like the forward
// wrapper, it moves the gradient-output tensor to the impl's final slot.
void rotated_feature_align_backward(const Tensor grad_out, const Tensor boxes,
                                    Tensor grad_in, const float scale,
                                    const int num_points) {
  rotated_feature_align_backward_impl(grad_out, boxes, scale, num_points,
                                      grad_in);
}
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
// Sparse max-pooling (forward), device-dispatched. Pools `features` over
// the precomputed indice pairs and returns a new tensor with `numAct`
// active output sites.
torch::Tensor indice_maxpool_forward_impl(torch::Tensor features,
                                          torch::Tensor indicePairs,
                                          torch::Tensor indiceNum,
                                          int64_t numAct) {
  return DISPATCH_DEVICE_IMPL(indice_maxpool_forward_impl, features,
                              indicePairs, indiceNum, numAct);
}
// Public entry for sparse max-pooling (forward): a pass-through to the
// device-dispatched implementation.
torch::Tensor indice_maxpool_forward(torch::Tensor feats, torch::Tensor pairs,
                                     torch::Tensor pairCount,
                                     int64_t numActive) {
  return indice_maxpool_forward_impl(feats, pairs, pairCount, numActive);
}
// Sparse max-pooling (backward), device-dispatched. Given the forward
// inputs/outputs and the output gradient, returns the gradient w.r.t.
// the input features.
torch::Tensor indice_maxpool_backward_impl(torch::Tensor features,
                                           torch::Tensor outFeatures,
                                           torch::Tensor outGrad,
                                           torch::Tensor indicePairs,
                                           torch::Tensor indiceNum) {
  return DISPATCH_DEVICE_IMPL(indice_maxpool_backward_impl, features,
                              outFeatures, outGrad, indicePairs, indiceNum);
}
// Public entry for sparse max-pooling (backward): forwards everything to
// the device-dispatched implementation and returns its result.
torch::Tensor indice_maxpool_backward(torch::Tensor feats,
                                      torch::Tensor pooledFeats,
                                      torch::Tensor pooledGrad,
                                      torch::Tensor pairs,
                                      torch::Tensor pairCount) {
  return indice_maxpool_backward_impl(feats, pooledFeats, pooledGrad, pairs,
                                      pairCount);
}
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
// Declaration of the CUDA kernel launcher (defined in the corresponding .cu
// file) that builds the forward indice pairs for NDim-dimensional sparse
// convolution.
template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
// CUDA entry point for building sparse-convolution indice pairs (forward).
// Thin wrapper forwarding every argument to the templated CUDA kernel
// launcher; NDim is the spatial dimensionality (2, 3 or 4).
// Fix: dropped the stray ';' that followed the function body (an empty
// declaration, ill-formed before C++11 and warned about under -pedantic).
template <unsigned NDim>
std::vector<torch::Tensor> get_indice_pairs_forward_cuda(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
  return GetIndicePairsForwardCUDAKernelLauncher<NDim>(
      indices, batchSize, outSpatialShape, spatialShape, kernelSize, stride,
      padding, dilation, outPadding, _subM, _transpose);
}
// Declaration of the CUDA kernel launcher (defined in the corresponding .cu
// file) that builds the backward indice pairs for NDim-dimensional sparse
// convolution, using the forward grid `gridOut`.
template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsBackwardCUDAKernelLauncher(
    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
// CUDA entry point for building sparse-convolution indice pairs (backward).
// Thin wrapper forwarding every argument to the templated CUDA kernel
// launcher.
// Fix: dropped the stray ';' that followed the function body (an empty
// declaration, ill-formed before C++11 and warned about under -pedantic).
template <unsigned NDim>
std::vector<torch::Tensor> get_indice_pairs_backward_cuda(
    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
  return GetIndicePairsBackwardCUDAKernelLauncher<NDim>(
      indices, gridOut, batchSize, outSpatialShape, spatialShape, kernelSize,
      stride, padding, dilation, outPadding, _subM, _transpose);
}
// Builds the indice pairs needed by NDim-dimensional sparse convolution
// (forward). Only a CUDA implementation exists: any tensor on a non-CUDA
// device is rejected, as is a CUDA tensor in a build without GPU support.
template <unsigned NDim>
std::vector<torch::Tensor> get_indice_pairs_forward(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
  // Guard clause: bail out early for non-CUDA tensors.
  if (!indices.device().is_cuda()) {
    AT_ERROR("get_indice_pairs is not implemented on CPU");
  }
#ifdef MMCV_WITH_CUDA
  CHECK_CUDA_INPUT(indices);
  return get_indice_pairs_forward_cuda<NDim>(
      indices, batchSize, outSpatialShape, spatialShape, kernelSize, stride,
      padding, dilation, outPadding, _subM, _transpose);
#else
  AT_ERROR("get_indice_pairs is not compiled with GPU support");
#endif
}
// Builds the indice pairs needed by NDim-dimensional sparse convolution
// (backward). CUDA-only, mirroring get_indice_pairs_forward: non-CUDA
// tensors and CPU-only builds both raise an error.
template <unsigned NDim>
std::vector<torch::Tensor> get_indice_pairs_backward(
    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
  // Guard clause: bail out early for non-CUDA tensors.
  if (!indices.device().is_cuda()) {
    AT_ERROR("get_indice_pairs is not implemented on CPU");
  }
#ifdef MMCV_WITH_CUDA
  CHECK_CUDA_INPUT(indices);
  CHECK_CUDA_INPUT(gridOut);
  return get_indice_pairs_backward_cuda<NDim>(
      indices, gridOut, batchSize, outSpatialShape, spatialShape, kernelSize,
      stride, padding, dilation, outPadding, _subM, _transpose);
#else
  AT_ERROR("get_indice_pairs is not compiled with GPU support");
#endif
}
// Sparse convolution (forward), device-dispatched. Applies `filters` to
// `features` gathered/scattered via the precomputed indice pairs; returns
// the output features with `numActOut` active sites. `_inverse` and `_subM`
// are integer flags selecting inverse / submanifold convolution variants.
torch::Tensor indice_conv_forward_impl(torch::Tensor features,
                                       torch::Tensor filters,
                                       torch::Tensor indicePairs,
                                       torch::Tensor indiceNum,
                                       int64_t numActOut, int64_t _inverse,
                                       int64_t _subM) {
  return DISPATCH_DEVICE_IMPL(indice_conv_forward_impl, features, filters,
                              indicePairs, indiceNum, numActOut, _inverse,
                              _subM);
}
// Public entry for sparse convolution (forward): delegates directly to the
// device-dispatched implementation.
torch::Tensor indice_conv_forward(torch::Tensor feats, torch::Tensor weights,
                                  torch::Tensor pairs, torch::Tensor pairCount,
                                  int64_t numActiveOut, int64_t inverseFlag,
                                  int64_t subMFlag) {
  return indice_conv_forward_impl(feats, weights, pairs, pairCount,
                                  numActiveOut, inverseFlag, subMFlag);
}
// Sparse convolution (backward), device-dispatched. Returns the gradients
// produced by the backend (w.r.t. features and filters) as a vector of
// tensors.
std::vector<torch::Tensor> indice_conv_backward_impl(
    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
    int64_t _subM) {
  return DISPATCH_DEVICE_IMPL(indice_conv_backward_impl, features, filters,
                              outGrad, indicePairs, indiceNum, _inverse, _subM);
}
std::vector<torch::Tensor> indice_conv_backward(
torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM) {
return indice_conv_backward_impl(features, filters, outGrad, indicePairs,
indiceNum, _inverse, _subM);
}
// Explicit template instantiations for every spatial dimensionality exposed
// through the Python bindings (forward: 2-D/3-D/4-D; backward: 2-D/3-D).
// These must stay in sync with the get_indice_pairs_*d_* m.def entries.
template std::vector<torch::Tensor> get_indice_pairs_forward<2>(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> get_indice_pairs_forward<3>(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> get_indice_pairs_forward<4>(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> get_indice_pairs_backward<2>(
    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> get_indice_pairs_backward<3>(
    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/script.h>
#include <utils/spconv/tensorview/tensorview.h>
#include "pytorch_cuda_helper.hpp"
namespace tv {
// Minimal holder for a raw CUDA stream handle, passed to tensorview kernels.
struct GPU {
  GPU(cudaStream_t s = 0) : mStream(s) {}
  // GPU is a polymorphic base (TorchGPU derives from it); a virtual
  // destructor is required so deleting through a GPU* is well-defined.
  virtual ~GPU() = default;
  virtual cudaStream_t getStream() const { return mStream; }
  cudaStream_t mStream = 0;
};

// GPU variant that always reports the CUDA stream currently managed by
// PyTorch instead of a fixed handle.
struct TorchGPU : public tv::GPU {
  virtual cudaStream_t getStream() const override {
    return at::cuda::getCurrentCUDAStream();
  }
};

// Asserts (via TV_ASSERT_RT_ERR) that the tensor's runtime scalar type
// matches the compile-time template parameter scalar_t; raises a runtime
// error on a mismatch or an unsupported dtype.
template <typename scalar_t>
void check_torch_dtype(const torch::Tensor &tensor) {
  // scalar_type() replaces the deprecated tensor.type().scalarType().
  switch (tensor.scalar_type()) {
    case at::ScalarType::Double: {
      auto val = std::is_same<std::remove_const_t<scalar_t>, double>::value;
      TV_ASSERT_RT_ERR(val, "error");
      break;
    }
    case at::ScalarType::Float: {
      auto val = std::is_same<std::remove_const_t<scalar_t>, float>::value;
      TV_ASSERT_RT_ERR(val, "error");
      break;
    }
    case at::ScalarType::Int: {
      auto val = std::is_same<std::remove_const_t<scalar_t>, int>::value;
      TV_ASSERT_RT_ERR(val, "error");
      break;
    }
    case at::ScalarType::Half: {
      auto val = std::is_same<std::remove_const_t<scalar_t>, at::Half>::value;
      TV_ASSERT_RT_ERR(val, "error");
      break;
    }
    case at::ScalarType::Long: {
      // ScalarType::Long is int64_t; comparing against `long` is wrong on
      // LLP64 platforms (Windows), where long is only 32-bit.
      auto val = std::is_same<std::remove_const_t<scalar_t>, int64_t>::value;
      TV_ASSERT_RT_ERR(val, "error");
      break;
    }
    default:
      TV_ASSERT_RT_ERR(false, "error");
  }
}

// Wraps a torch::Tensor as a non-owning tv::TensorView<scalar_t> sharing the
// tensor's memory; the dtype is validated first. The caller must keep the
// tensor alive for the lifetime of the view.
template <typename scalar_t>
tv::TensorView<scalar_t> torch2tv(const torch::Tensor &tensor) {
  check_torch_dtype<scalar_t>(tensor);
  tv::Shape shape;
  for (auto i : tensor.sizes()) {
    shape.push_back(i);
  }
  return tv::TensorView<scalar_t>(
      tensor.data_ptr<std::remove_const_t<scalar_t>>(), shape);
}
}  // namespace tv
......@@ -14,6 +14,17 @@ int hard_voxelize_forward_impl(const at::Tensor &points, at::Tensor &voxels,
max_points, max_voxels, NDim);
}
// Non-deterministic variant of hard voxelization, routed to a device-specific
// backend via the DISPATCH_DEVICE_IMPL project macro (definition not visible
// here). hard_voxelize_forward selects this variant when deterministic=false
// and stores the returned int as the produced voxel count -- confirm the
// exact return semantics against the backend implementations.
int nondeterministic_hard_voxelize_forward_impl(
    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
    const std::vector<float> coors_range, const int max_points,
    const int max_voxels, const int NDim = 3) {
  return DISPATCH_DEVICE_IMPL(nondeterministic_hard_voxelize_forward_impl,
                              points, voxels, coors, num_points_per_voxel,
                              voxel_size, coors_range, max_points, max_voxels,
                              NDim);
}
void dynamic_voxelize_forward_impl(const at::Tensor &points, at::Tensor &coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
......@@ -27,7 +38,8 @@ void hard_voxelize_forward(const at::Tensor &points,
const at::Tensor &coors_range, at::Tensor &voxels,
at::Tensor &coors, at::Tensor &num_points_per_voxel,
at::Tensor &voxel_num, const int max_points,
const int max_voxels, const int NDim = 3) {
const int max_voxels, const int NDim = 3,
const bool deterministic = true) {
int64_t *voxel_num_data = voxel_num.data_ptr<int64_t>();
std::vector<float> voxel_size_v(
voxel_size.data_ptr<float>(),
......@@ -36,9 +48,15 @@ void hard_voxelize_forward(const at::Tensor &points,
coors_range.data_ptr<float>(),
coors_range.data_ptr<float>() + coors_range.numel());
*voxel_num_data = hard_voxelize_forward_impl(
points, voxels, coors, num_points_per_voxel, voxel_size_v, coors_range_v,
max_points, max_voxels, NDim);
if (deterministic) {
*voxel_num_data = hard_voxelize_forward_impl(
points, voxels, coors, num_points_per_voxel, voxel_size_v,
coors_range_v, max_points, max_voxels, NDim);
} else {
*voxel_num_data = nondeterministic_hard_voxelize_forward_impl(
points, voxels, coors, num_points_per_voxel, voxel_size_v,
coors_range_v, max_points, max_voxels, NDim);
}
}
void dynamic_voxelize_forward(const at::Tensor &points,
......
......@@ -85,7 +85,7 @@ void CornerPoolForwardLauncher(const scalar_t *input, scalar_t *output,
case 0:
case 1:
nthreads = batch_size * channels * width;
col_block = DIVUP(nthreads, THREADS_PER_BLOCK);
col_block = GET_BLOCKS(nthreads, THREADS_PER_BLOCK);
top_bottom_pool_kernel<scalar_t>
<<<col_block, THREADS_PER_BLOCK, 0, stream>>>(
input, output, batch_size, channels, height, width, pool_type);
......@@ -93,7 +93,7 @@ void CornerPoolForwardLauncher(const scalar_t *input, scalar_t *output,
case 2:
case 3:
nthreads = batch_size * channels * height;
col_block = DIVUP(nthreads, THREADS_PER_BLOCK);
col_block = GET_BLOCKS(nthreads, THREADS_PER_BLOCK);
left_right_pool_kernel<scalar_t>
<<<col_block, THREADS_PER_BLOCK, 0, stream>>>(
input, output, batch_size, channels, height, width, pool_type);
......
......@@ -67,7 +67,7 @@ void CumMaxMinForwardLauncher(const scalar_t *input, scalar_t *output_value,
const int data_size =
tensor_desc.stride[0] * tensor_desc.shape[0] / tensor_desc.shape[cum_dim];
const int col_block = DIVUP(data_size, THREADS_PER_BLOCK);
const int col_block = GET_BLOCKS(data_size, THREADS_PER_BLOCK);
cummaxmin_kernel<scalar_t><<<col_block, THREADS_PER_BLOCK, 0, stream>>>(
input, output_value, output_index, tensor_desc, cum_dim, cum_type);
......
......@@ -282,7 +282,7 @@ nvinfer1::IPluginV2 *DeformableConvPluginDynamicCreator::createPlugin(
}
}
if (field_name.compare("deformable_group") == 0) {
if (field_name.compare("deform_groups") == 0) {
deformableGroup = static_cast<const int *>(fc->fields[i].data)[0];
}
......
......@@ -254,7 +254,7 @@ nvinfer1::IPluginV2 *ModulatedDeformableConvPluginDynamicCreator::createPlugin(
}
std::string field_name(fc->fields[i].name);
if (field_name.compare("deformable_group") == 0) {
if (field_name.compare("deform_groups") == 0) {
deformableGroup = static_cast<const int *>(fc->fields[i].data)[0];
}
......
......@@ -114,7 +114,8 @@ size_t get_onnxnms_workspace_size(size_t num_batches, size_t spatial_dimension,
mmcv::getAlignedSize(spatial_dimension * boxes_word_size);
size_t boxes_workspace =
mmcv::getAlignedSize(spatial_dimension * 4 * boxes_word_size);
const int col_blocks = DIVUP(spatial_dimension, threadsPerBlock);
const int col_blocks =
(spatial_dimension + threadsPerBlock - 1) / threadsPerBlock;
size_t mask_workspace = mmcv::getAlignedSize(spatial_dimension * col_blocks *
sizeof(unsigned long long));
size_t index_template_workspace =
......@@ -163,7 +164,8 @@ void TRTNMSCUDAKernelLauncher_float(const float* boxes, const float* scores,
int spatial_dimension, int num_classes,
size_t output_length, void* workspace,
cudaStream_t stream) {
const int col_blocks = DIVUP(spatial_dimension, threadsPerBlock);
const int col_blocks =
(spatial_dimension + threadsPerBlock - 1) / threadsPerBlock;
float* boxes_sorted = (float*)workspace;
workspace = static_cast<char*>(workspace) +
mmcv::getAlignedSize(spatial_dimension * 4 * sizeof(float));
......
......@@ -67,7 +67,7 @@ void TRTONNXScatterNDKernelLauncher(const T* data, const int* indices,
num_update_indice *= indice_desc.shape[i];
}
// scatter
const int col_block = DIVUP(num_update_indice, threadsPerBlock);
const int col_block = GET_BLOCKS(num_update_indice, threadsPerBlock);
onnx_scatternd_kernel<<<col_block, threadsPerBlock, 0, stream>>>(
num_update_indice, indices, update, output, tensor_desc, indice_desc);
}
......
......@@ -3,8 +3,6 @@
#define TRT_CUDA_HELPER_HPP
#include <cublas_v2.h>
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define cudaCheckError() \
{ \
cudaError_t e = cudaGetLastError(); \
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple, Union
from typing import Optional, Tuple, Union
import torch
import torch.nn as nn
......@@ -48,16 +48,16 @@ class DeformConv2dFunction(Function):
@staticmethod
def forward(ctx,
input,
offset,
weight,
stride=1,
padding=0,
dilation=1,
groups=1,
deform_groups=1,
bias=False,
im2col_step=32):
input: Tensor,
offset: Tensor,
weight: Tensor,
stride: Union[int, Tuple[int, ...]] = 1,
padding: Union[int, Tuple[int, ...]] = 0,
dilation: Union[int, Tuple[int, ...]] = 1,
groups: int = 1,
deform_groups: int = 1,
bias: bool = False,
im2col_step: int = 32) -> Tensor:
if input is not None and input.dim() != 4:
raise ValueError(
f'Expected 4D tensor as input, got {input.dim()}D tensor \
......@@ -111,7 +111,10 @@ class DeformConv2dFunction(Function):
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
def backward(
ctx, grad_output: Tensor
) -> Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor], None,
None, None, None, None, None, None]:
input, offset, weight = ctx.saved_tensors
grad_input = grad_offset = grad_weight = None
......@@ -236,7 +239,7 @@ class DeformConv2d(nn.Module):
deform_groups: int = 1,
bias: bool = False,
im2col_step: int = 32) -> None:
super(DeformConv2d, self).__init__()
super().__init__()
assert not bias, \
f'bias={bias} is not supported in DeformConv2d.'
......@@ -356,7 +359,7 @@ class DeformConv2dPack(DeformConv2d):
_version = 2
def __init__(self, *args, **kwargs):
super(DeformConv2dPack, self).__init__(*args, **kwargs)
super().__init__(*args, **kwargs)
self.conv_offset = nn.Conv2d(
self.in_channels,
self.deform_groups * 2 * self.kernel_size[0] * self.kernel_size[1],
......@@ -371,7 +374,7 @@ class DeformConv2dPack(DeformConv2d):
self.conv_offset.weight.data.zero_()
self.conv_offset.bias.data.zero_()
def forward(self, x):
def forward(self, x: Tensor) -> Tensor: # type: ignore
offset = self.conv_offset(x)
return deform_conv2d(x, offset, self.weight, self.stride, self.padding,
self.dilation, self.groups, self.deform_groups,
......
# Copyright (c) OpenMMLab. All rights reserved.
from torch import nn
from typing import Optional, Tuple
from torch import Tensor, nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair
......@@ -28,13 +30,13 @@ class DeformRoIPoolFunction(Function):
@staticmethod
def forward(ctx,
input,
rois,
offset,
output_size,
spatial_scale=1.0,
sampling_ratio=0,
gamma=0.1):
input: Tensor,
rois: Tensor,
offset: Optional[Tensor],
output_size: Tuple[int, ...],
spatial_scale: float = 1.0,
sampling_ratio: int = 0,
gamma: float = 0.1) -> Tensor:
if offset is None:
offset = input.new_zeros(0)
ctx.output_size = _pair(output_size)
......@@ -64,7 +66,9 @@ class DeformRoIPoolFunction(Function):
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
def backward(
ctx, grad_output: Tensor
) -> Tuple[Tensor, None, Tensor, None, None, None, None]:
input, rois, offset = ctx.saved_tensors
grad_input = grad_output.new_zeros(input.shape)
grad_offset = grad_output.new_zeros(offset.shape)
......@@ -92,17 +96,20 @@ deform_roi_pool = DeformRoIPoolFunction.apply
class DeformRoIPool(nn.Module):
def __init__(self,
output_size,
spatial_scale=1.0,
sampling_ratio=0,
gamma=0.1):
super(DeformRoIPool, self).__init__()
output_size: Tuple[int, ...],
spatial_scale: float = 1.0,
sampling_ratio: int = 0,
gamma: float = 0.1):
super().__init__()
self.output_size = _pair(output_size)
self.spatial_scale = float(spatial_scale)
self.sampling_ratio = int(sampling_ratio)
self.gamma = float(gamma)
def forward(self, input, rois, offset=None):
def forward(self,
input: Tensor,
rois: Tensor,
offset: Optional[Tensor] = None) -> Tensor:
return deform_roi_pool(input, rois, offset, self.output_size,
self.spatial_scale, self.sampling_ratio,
self.gamma)
......@@ -111,14 +118,13 @@ class DeformRoIPool(nn.Module):
class DeformRoIPoolPack(DeformRoIPool):
def __init__(self,
output_size,
output_channels,
deform_fc_channels=1024,
spatial_scale=1.0,
sampling_ratio=0,
gamma=0.1):
super(DeformRoIPoolPack, self).__init__(output_size, spatial_scale,
sampling_ratio, gamma)
output_size: Tuple[int, ...],
output_channels: int,
deform_fc_channels: int = 1024,
spatial_scale: float = 1.0,
sampling_ratio: int = 0,
gamma: float = 0.1):
super().__init__(output_size, spatial_scale, sampling_ratio, gamma)
self.output_channels = output_channels
self.deform_fc_channels = deform_fc_channels
......@@ -135,7 +141,7 @@ class DeformRoIPoolPack(DeformRoIPool):
self.offset_fc[-1].weight.data.zero_()
self.offset_fc[-1].bias.data.zero_()
def forward(self, input, rois):
def forward(self, input: Tensor, rois: Tensor) -> Tensor: # type: ignore
assert input.size(1) == self.output_channels
x = deform_roi_pool(input, rois, None, self.output_size,
self.spatial_scale, self.sampling_ratio,
......@@ -152,14 +158,13 @@ class DeformRoIPoolPack(DeformRoIPool):
class ModulatedDeformRoIPoolPack(DeformRoIPool):
def __init__(self,
output_size,
output_channels,
deform_fc_channels=1024,
spatial_scale=1.0,
sampling_ratio=0,
gamma=0.1):
super(ModulatedDeformRoIPoolPack,
self).__init__(output_size, spatial_scale, sampling_ratio, gamma)
output_size: Tuple[int, ...],
output_channels: int,
deform_fc_channels: int = 1024,
spatial_scale: float = 1.0,
sampling_ratio: int = 0,
gamma: float = 0.1):
super().__init__(output_size, spatial_scale, sampling_ratio, gamma)
self.output_channels = output_channels
self.deform_fc_channels = deform_fc_channels
......@@ -187,7 +192,7 @@ class ModulatedDeformRoIPoolPack(DeformRoIPool):
self.mask_fc[2].weight.data.zero_()
self.mask_fc[2].bias.data.zero_()
def forward(self, input, rois):
def forward(self, input: Tensor, rois: Tensor) -> Tensor: # type: ignore
assert input.size(1) == self.output_channels
x = deform_roi_pool(input, rois, None, self.output_size,
self.spatial_scale, self.sampling_ratio,
......
......@@ -12,7 +12,8 @@ class Conv2d_deprecated(Conv2d):
super().__init__(*args, **kwargs)
warnings.warn(
'Importing Conv2d wrapper from "mmcv.ops" will be deprecated in'
' the future. Please import them from "mmcv.cnn" instead')
' the future. Please import them from "mmcv.cnn" instead',
DeprecationWarning)
class ConvTranspose2d_deprecated(ConvTranspose2d):
......@@ -22,7 +23,7 @@ class ConvTranspose2d_deprecated(ConvTranspose2d):
warnings.warn(
'Importing ConvTranspose2d wrapper from "mmcv.ops" will be '
'deprecated in the future. Please import them from "mmcv.cnn" '
'instead')
'instead', DeprecationWarning)
class MaxPool2d_deprecated(MaxPool2d):
......@@ -31,7 +32,8 @@ class MaxPool2d_deprecated(MaxPool2d):
super().__init__(*args, **kwargs)
warnings.warn(
'Importing MaxPool2d wrapper from "mmcv.ops" will be deprecated in'
' the future. Please import them from "mmcv.cnn" instead')
' the future. Please import them from "mmcv.cnn" instead',
DeprecationWarning)
class Linear_deprecated(Linear):
......@@ -40,4 +42,5 @@ class Linear_deprecated(Linear):
super().__init__(*args, **kwargs)
warnings.warn(
'Importing Linear wrapper from "mmcv.ops" will be deprecated in'
' the future. Please import them from "mmcv.cnn" instead')
' the future. Please import them from "mmcv.cnn" instead',
DeprecationWarning)
# Copyright (c) OpenMMLab. All rights reserved.
# Adapted from https://github.com/lilanxiao/Rotated_IoU/blob/master/box_intersection_2d.py # noqa
# Adapted from https://github.com/lilanxiao/Rotated_IoU/blob/master/oriented_iou_loss.py # noqa
from typing import Tuple
import torch
from torch import Tensor
from torch.autograd import Function
from ..utils import ext_loader
EPSILON = 1e-8
ext_module = ext_loader.load_ext('_ext',
['diff_iou_rotated_sort_vertices_forward'])
class SortVertices(Function):
    """Autograd wrapper around the compiled vertex-sorting op.

    The op produces integer indices, so the output is marked
    non-differentiable and no gradient is propagated back.
    """

    @staticmethod
    def forward(ctx, vertices, mask, num_valid):
        # Delegate the actual sorting to the compiled extension op.
        sorted_idx = ext_module.diff_iou_rotated_sort_vertices_forward(
            vertices, mask, num_valid)
        # parrots builds report torch.__version__ == 'parrots' and do not
        # support mark_non_differentiable.
        if torch.__version__ != 'parrots':
            ctx.mark_non_differentiable(sorted_idx)
        return sorted_idx

    @staticmethod
    def backward(ctx, gradout):
        # Integer indices carry no gradient.
        return ()
def box_intersection(corners1: Tensor,
                     corners2: Tensor) -> Tuple[Tensor, Tensor]:
    """Find pairwise edge intersection points of two batches of rectangles.

    Convention: if two edges are collinear, there is no intersection point.

    Args:
        corners1 (Tensor): (B, N, 4, 2) First batch of boxes.
        corners2 (Tensor): (B, N, 4, 2) Second batch of boxes.

    Returns:
        Tuple:
         - Tensor: (B, N, 4, 4, 2) Intersections.
         - Tensor: (B, N, 4, 4) Valid intersections mask.
    """
    # Represent each edge as (x_start, y_start, x_end, y_end) by pairing
    # every corner with its cyclic successor -> (B, N, 4, 4).
    succ = [1, 2, 3, 0]
    edges1 = torch.cat([corners1, corners1[:, :, succ, :]], dim=3)
    edges2 = torch.cat([corners2, corners2[:, :, succ, :]], dim=3)
    # Broadcast so every edge of box1 meets every edge of box2:
    # (B, N, 4, 4) -> (B, N, 4, 4, 4): Batch, Box, edge1, edge2, point.
    x1, y1, x2, y2 = edges1.unsqueeze(3).split(1, dim=-1)
    x3, y3, x4, y4 = edges2.unsqueeze(2).split(1, dim=-1)
    # Segment-segment intersection, see
    # https://en.wikipedia.org/wiki/Line%E2%80%93line_intersection
    num = (x1 - x2) * (y3 - y4) - (y1 - y2) * (x3 - x4)
    den_t = (x1 - x3) * (y3 - y4) - (y1 - y3) * (x3 - x4)
    t = den_t / num
    t[num == .0] = -1.
    mask_t = (t > 0) & (t < 1)  # hit lies within segment 1
    den_u = (x1 - x2) * (y1 - y3) - (y1 - y2) * (x1 - x3)
    u = -den_u / num
    u[num == .0] = -1.
    mask_u = (u > 0) & (u < 1)  # hit lies within segment 2
    mask = mask_t * mask_u
    # Recompute t with EPSILON in the denominator for numerical stability;
    # entries invalidated above are zeroed out by the mask below.
    t = den_t / (num + EPSILON)
    intersections = torch.stack([x1 + t * (x2 - x1), y1 + t * (y2 - y1)],
                                dim=-1)
    intersections = intersections * mask.float().unsqueeze(-1)
    return intersections, mask
def box1_in_box2(corners1: Tensor, corners2: Tensor) -> Tensor:
    """Check which corners of box1 lie inside box2.

    Convention: a corner exactly on an edge of the other box also counts
    as a valid (inside) point.

    Args:
        corners1 (Tensor): (B, N, 4, 2) First batch of boxes.
        corners2 (Tensor): (B, N, 4, 2) Second batch of boxes.

    Returns:
        Tensor: (B, N, 4) Inside-mask for the four corners of box1.
    """
    # Describe box2 by vertex a and its two edge vectors ab, ad; a point m
    # lies inside iff its normalized projections onto ab and ad both fall
    # within [0, 1].
    a = corners2[:, :, 0:1, :]  # (B, N, 1, 2)
    b = corners2[:, :, 1:2, :]  # (B, N, 1, 2)
    d = corners2[:, :, 3:4, :]  # (B, N, 1, 2)
    ab = b - a  # (B, N, 1, 2)
    ad = d - a  # (B, N, 1, 2)
    am = corners1 - a  # (B, N, 4, 2)
    prod_ab = torch.sum(ab * am, dim=-1)  # (B, N, 4)
    norm_ab = torch.sum(ab * ab, dim=-1)  # (B, N, 1)
    prod_ad = torch.sum(ad * am, dim=-1)  # (B, N, 4)
    norm_ad = torch.sum(ad * ad, dim=-1)  # (B, N, 1)
    # Slightly widened bounds (-1e-6, 1 + 1e-6) keep the test numerically
    # stable when the two boxes coincide, and across different box scales.
    ratio_ab = prod_ab / norm_ab
    ratio_ad = prod_ad / norm_ad
    cond1 = (ratio_ab > -1e-6) * (ratio_ab < 1 + 1e-6)  # (B, N, 4)
    cond2 = (ratio_ad > -1e-6) * (ratio_ad < 1 + 1e-6)  # (B, N, 4)
    return cond1 * cond2
def box_in_box(corners1: Tensor, corners2: Tensor) -> Tuple[Tensor, Tensor]:
    """Mutual containment test for the corners of two batches of boxes.

    Args:
        corners1 (Tensor): (B, N, 4, 2) First batch of boxes.
        corners2 (Tensor): (B, N, 4, 2) Second batch of boxes.

    Returns:
        Tuple:
         - Tensor: (B, N, 4) True if i-th corner of box1 is in box2.
         - Tensor: (B, N, 4) True if i-th corner of box2 is in box1.
    """
    # Run the one-directional test both ways.
    return (box1_in_box2(corners1, corners2),
            box1_in_box2(corners2, corners1))
def build_vertices(corners1: Tensor, corners2: Tensor, c1_in_2: Tensor,
                   c2_in_1: Tensor, intersections: Tensor,
                   valid_mask: Tensor) -> Tuple[Tensor, Tensor]:
    """Gather all candidate vertices of the intersection polygon.

    Args:
        corners1 (Tensor): (B, N, 4, 2) First batch of boxes.
        corners2 (Tensor): (B, N, 4, 2) Second batch of boxes.
        c1_in_2 (Tensor): (B, N, 4) True if i-th corner of box1 is in box2.
        c2_in_1 (Tensor): (B, N, 4) True if i-th corner of box2 is in box1.
        intersections (Tensor): (B, N, 4, 4, 2) Intersections.
        valid_mask (Tensor): (B, N, 4, 4) Valid intersections mask.

    Returns:
        Tuple:
         - Tensor: (B, N, 24, 2) Vertices of intersection area;
               only some elements are valid.
         - Tensor: (B, N, 24) Mask of valid elements in vertices.
    """
    batch, num_boxes = corners1.size()[0], corners1.size()[1]
    # Candidates are the 4 corners of each box plus the 16 pairwise edge
    # intersections -> (B, N, 4 + 4 + 16, 2). Invalid entries are
    # zero-valued with zero gradient, which downstream code exploits.
    flat_inter = intersections.view([batch, num_boxes, -1, 2])
    vertices = torch.cat([corners1, corners2, flat_inter], dim=2)
    # Matching validity mask, Bool (B, N, 4 + 4 + 16).
    mask = torch.cat(
        [c1_in_2, c2_in_1, valid_mask.view([batch, num_boxes, -1])], dim=2)
    return vertices, mask
def sort_indices(vertices: Tensor, mask: Tensor) -> Tensor:
    """Sort the valid intersection-polygon vertices.

    Note:
        why 9? the polygon has maximal 8 vertices.
        +1 to duplicate the first element.
        the index should have following structure:
            (A, B, C, ... , A, X, X, X)
        and X indicates the index of arbitrary elements in the last
        16 (intersections not corners) with value 0 and mask False.
        (cause they have zero value and zero gradient)

    Args:
        vertices (Tensor): (B, N, 24, 2) Box vertices.
        mask (Tensor): (B, N, 24) Mask.

    Returns:
        Tensor: (B, N, 9) Sorted indices.
    """
    num_valid = torch.sum(mask.int(), dim=2).int()  # (B, N)
    # Center the vertices on the mean of the valid ones; normalization
    # makes the sorting easier.
    weighted = vertices * mask.float().unsqueeze(-1)
    mean = torch.sum(
        weighted, dim=2,
        keepdim=True) / num_valid.unsqueeze(-1).unsqueeze(-1)
    centered = vertices - mean
    return SortVertices.apply(centered, mask, num_valid).long()
def calculate_area(idx_sorted: Tensor,
                   vertices: Tensor) -> Tuple[Tensor, Tensor]:
    """Compute the intersection area via the shoelace formula.

    Args:
        idx_sorted (Tensor): (B, N, 9) Sorted vertex ids.
        vertices (Tensor): (B, N, 24, 2) Vertices.

    Returns:
        Tuple:
         - Tensor (B, N): Area of intersection.
         - Tensor: (B, N, 9, 2) Vertices of polygon with zero padding.
    """
    # Gather the polygon vertices in sorted order.
    gather_idx = idx_sorted.unsqueeze(-1).repeat([1, 1, 1, 2])
    polygon = torch.gather(vertices, 2, gather_idx)
    # Shoelace formula: signed cross products of consecutive vertices.
    cross = polygon[:, :, 0:-1, 0] * polygon[:, :, 1:, 1] \
        - polygon[:, :, 0:-1, 1] * polygon[:, :, 1:, 0]
    area = torch.abs(torch.sum(cross, dim=2)) / 2
    return area, polygon
def oriented_box_intersection_2d(corners1: Tensor,
                                 corners2: Tensor) -> Tuple[Tensor, Tensor]:
    """Calculate intersection area of 2d rotated boxes.

    Args:
        corners1 (Tensor): (B, N, 4, 2) First batch of boxes.
        corners2 (Tensor): (B, N, 4, 2) Second batch of boxes.

    Returns:
        Tuple:
         - Tensor (B, N): Area of intersection.
         - Tensor (B, N, 9, 2): Vertices of polygon with zero padding.
    """
    # Pipeline: edge intersections -> mutual corner containment ->
    # candidate vertices -> sorted polygon -> shoelace area.
    inter_pts, inter_mask = box_intersection(corners1, corners2)
    c12, c21 = box_in_box(corners1, corners2)
    candidates, cand_mask = build_vertices(corners1, corners2, c12, c21,
                                           inter_pts, inter_mask)
    order = sort_indices(candidates, cand_mask)
    return calculate_area(order, candidates)
def box2corners(box: Tensor) -> Tensor:
    """Convert rotated 2d box coordinate to corners.

    Args:
        box (Tensor): (B, N, 5) with x, y, w, h, alpha.

    Returns:
        Tensor: (B, N, 4, 2) Corners.
    """
    batch = box.size()[0]
    x, y, w, h, alpha = box.split(1, dim=-1)
    # Axis-aligned corner offsets, ordered (+,+), (-,+), (-,-), (+,-).
    dx = torch.tensor([0.5, -0.5, -0.5, 0.5],
                      dtype=torch.float32, device=box.device) * w  # (B, N, 4)
    dy = torch.tensor([0.5, 0.5, -0.5, -0.5],
                      dtype=torch.float32, device=box.device) * h  # (B, N, 4)
    corners = torch.stack([dx, dy], dim=-1)  # (B, N, 4, 2)
    # Transposed 2x2 rotation matrices for right-multiplication.
    sin = torch.sin(alpha)
    cos = torch.cos(alpha)
    rot_T = torch.stack(
        [torch.cat([cos, sin], dim=-1),
         torch.cat([-sin, cos], dim=-1)], dim=-2)  # (B, N, 2, 2)
    rotated = torch.bmm(corners.view([-1, 4, 2]), rot_T.view([-1, 2, 2]))
    rotated = rotated.view([batch, -1, 4, 2])  # (B * N, 4, 2) -> (B, N, 4, 2)
    # Translate from the origin to the box center.
    rotated[..., 0] += x
    rotated[..., 1] += y
    return rotated
def diff_iou_rotated_2d(box1: Tensor, box2: Tensor) -> Tensor:
    """Calculate differentiable iou of rotated 2d boxes.

    Args:
        box1 (Tensor): (B, N, 5) First box.
        box2 (Tensor): (B, N, 5) Second box.

    Returns:
        Tensor: (B, N) IoU.
    """
    # Intersection of the two corner polygons, then IoU via
    # inclusion-exclusion on the rectangle areas (w * h).
    intersection, _ = oriented_box_intersection_2d(
        box2corners(box1), box2corners(box2))  # (B, N)
    area1 = box1[:, :, 2] * box1[:, :, 3]
    area2 = box2[:, :, 2] * box2[:, :, 3]
    union = area1 + area2 - intersection
    return intersection / union
def diff_iou_rotated_3d(box3d1: Tensor, box3d2: Tensor) -> Tensor:
    """Calculate differentiable iou of rotated 3d boxes.

    Args:
        box3d1 (Tensor): (B, N, 3+3+1) First box (x,y,z,w,h,l,alpha).
        box3d2 (Tensor): (B, N, 3+3+1) Second box (x,y,z,w,h,l,alpha).

    Returns:
        Tensor: (B, N) IoU.
    """
    # Project to 2d boxes (x, y, w, h, alpha) by dropping z and length.
    bev1 = box3d1[..., [0, 1, 3, 4, 6]]
    bev2 = box3d2[..., [0, 1, 3, 4, 6]]
    intersection, _ = oriented_box_intersection_2d(
        box2corners(bev1), box2corners(bev2))
    # Overlap of the two vertical extents [z - l/2, z + l/2].
    zmax1 = box3d1[..., 2] + box3d1[..., 5] * 0.5
    zmin1 = box3d1[..., 2] - box3d1[..., 5] * 0.5
    zmax2 = box3d2[..., 2] + box3d2[..., 5] * 0.5
    zmin2 = box3d2[..., 2] - box3d2[..., 5] * 0.5
    z_overlap = (torch.min(zmax1, zmax2) -
                 torch.max(zmin1, zmin2)).clamp_(min=0.)
    intersection_3d = intersection * z_overlap
    volume1 = box3d1[..., 3] * box3d1[..., 4] * box3d1[..., 5]
    volume2 = box3d2[..., 3] * box3d2[..., 4] * box3d2[..., 5]
    union_3d = volume1 + volume2 - intersection_3d
    return intersection_3d / union_3d
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional, Union
import torch
import torch.nn as nn
from torch.autograd import Function
......@@ -15,7 +17,9 @@ ext_module = ext_loader.load_ext('_ext', [
class SigmoidFocalLossFunction(Function):
@staticmethod
def symbolic(g, input, target, gamma, alpha, weight, reduction):
def symbolic(g, input: torch.Tensor, target: torch.LongTensor,
gamma: float, alpha: float, weight: torch.Tensor,
reduction: str):
return g.op(
'mmcv::MMCVSigmoidFocalLoss',
input,
......@@ -27,14 +31,15 @@ class SigmoidFocalLossFunction(Function):
@staticmethod
def forward(ctx,
input,
target,
gamma=2.0,
alpha=0.25,
weight=None,
reduction='mean'):
assert isinstance(target, (torch.LongTensor, torch.cuda.LongTensor))
input: torch.Tensor,
target: Union[torch.LongTensor, torch.cuda.LongTensor],
gamma: float = 2.0,
alpha: float = 0.25,
weight: Optional[torch.Tensor] = None,
reduction: str = 'mean') -> torch.Tensor:
assert isinstance(
target, (torch.Tensor, torch.LongTensor, torch.cuda.LongTensor))
assert input.dim() == 2
assert target.dim() == 1
assert input.size(0) == target.size(0)
......@@ -63,7 +68,7 @@ class SigmoidFocalLossFunction(Function):
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
def backward(ctx, grad_output: torch.Tensor) -> tuple:
input, target, weight = ctx.saved_tensors
grad_input = input.new_zeros(input.size())
......@@ -87,14 +92,22 @@ sigmoid_focal_loss = SigmoidFocalLossFunction.apply
class SigmoidFocalLoss(nn.Module):
def __init__(self, gamma, alpha, weight=None, reduction='mean'):
super(SigmoidFocalLoss, self).__init__()
def __init__(self,
gamma: float,
alpha: float,
weight: Optional[torch.Tensor] = None,
reduction: str = 'mean'):
super().__init__()
self.gamma = gamma
self.alpha = alpha
self.register_buffer('weight', weight)
self.reduction = reduction
def forward(self, input, target):
def forward(
self,
input: torch.Tensor,
target: Union[torch.LongTensor, torch.cuda.LongTensor],
) -> torch.Tensor:
return sigmoid_focal_loss(input, target, self.gamma, self.alpha,
self.weight, self.reduction)
......@@ -109,7 +122,9 @@ class SigmoidFocalLoss(nn.Module):
class SoftmaxFocalLossFunction(Function):
@staticmethod
def symbolic(g, input, target, gamma, alpha, weight, reduction):
def symbolic(g, input: torch.Tensor, target: torch.LongTensor,
gamma: float, alpha: float, weight: torch.Tensor,
reduction: str):
return g.op(
'mmcv::MMCVSoftmaxFocalLoss',
input,
......@@ -121,12 +136,12 @@ class SoftmaxFocalLossFunction(Function):
@staticmethod
def forward(ctx,
input,
target,
gamma=2.0,
alpha=0.25,
weight=None,
reduction='mean'):
input: torch.Tensor,
target: Union[torch.LongTensor, torch.cuda.LongTensor],
gamma: float = 2.0,
alpha: float = 0.25,
weight: Optional[torch.Tensor] = None,
reduction='mean') -> torch.Tensor:
assert isinstance(target, (torch.LongTensor, torch.cuda.LongTensor))
assert input.dim() == 2
......@@ -168,7 +183,7 @@ class SoftmaxFocalLossFunction(Function):
return output
@staticmethod
def backward(ctx, grad_output):
def backward(ctx, grad_output: torch.Tensor) -> tuple:
input_softmax, target, weight = ctx.saved_tensors
buff = input_softmax.new_zeros(input_softmax.size(0))
grad_input = input_softmax.new_zeros(input_softmax.size())
......@@ -193,14 +208,22 @@ softmax_focal_loss = SoftmaxFocalLossFunction.apply
class SoftmaxFocalLoss(nn.Module):
def __init__(self, gamma, alpha, weight=None, reduction='mean'):
super(SoftmaxFocalLoss, self).__init__()
def __init__(self,
gamma: float,
alpha: float,
weight: Optional[torch.Tensor] = None,
reduction: str = 'mean'):
super().__init__()
self.gamma = gamma
self.alpha = alpha
self.register_buffer('weight', weight)
self.reduction = reduction
def forward(self, input, target):
def forward(
self,
input: torch.Tensor,
target: Union[torch.LongTensor, torch.cuda.LongTensor],
) -> torch.Tensor:
return softmax_focal_loss(input, target, self.gamma, self.alpha,
self.weight, self.reduction)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment